Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c =================================================================== --- head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c (revision 175201) +++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c (revision 175202) @@ -1,220 +1,220 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include void kobj_free(void *address, size_t size) { kmem_free(address, size); } void * kobj_alloc(size_t size, int flag) { return (kmem_alloc(size, (flag & KM_NOWAIT) ? KM_NOSLEEP : KM_SLEEP)); } void * kobj_zalloc(size_t size, int flag) { void *p; if ((p = kobj_alloc(size, flag)) != NULL) bzero(p, size); return (p); } static void * kobj_open_file_vnode(const char *file) { struct thread *td = curthread; struct nameidata nd; int error, flags; if (td->td_proc->p_fd->fd_rdir == NULL) td->td_proc->p_fd->fd_rdir = rootvnode; if (td->td_proc->p_fd->fd_cdir == NULL) td->td_proc->p_fd->fd_cdir = rootvnode; flags = FREAD; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td); error = vn_open_cred(&nd, &flags, 0, td->td_ucred, NULL); NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) return (NULL); /* We just unlock so we hold a reference. */ VOP_UNLOCK(nd.ni_vp, 0, td); return (nd.ni_vp); } static void * kobj_open_file_loader(const char *file) { return (preload_search_by_name(file)); } struct _buf * kobj_open_file(const char *file) { struct _buf *out; out = kmem_alloc(sizeof(*out), KM_SLEEP); out->mounted = root_mounted(); /* * If root is already mounted we read file using file system, * if not, we use loader. */ if (out->mounted) out->ptr = kobj_open_file_vnode(file); else out->ptr = kobj_open_file_loader(file); if (out->ptr == NULL) { kmem_free(out, sizeof(*out)); return ((struct _buf *)-1); } return (out); } static int kobj_get_filesize_vnode(struct _buf *file, uint64_t *size) { struct vnode *vp = file->ptr; struct thread *td = curthread; struct vattr va; int error; - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) *size = (uint64_t)va.va_size; return (error); } static int kobj_get_filesize_loader(struct _buf *file, uint64_t *size) { void *ptr; ptr = preload_search_info(file->ptr, MODINFO_SIZE); if (ptr == NULL) return (ENOENT); *size = (uint64_t)*(size_t *)ptr; return (0); } int kobj_get_filesize(struct _buf *file, uint64_t *size) { if (file->mounted) return (kobj_get_filesize_vnode(file, size)); else return (kobj_get_filesize_loader(file, size)); } int kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off) { struct vnode *vp = file->ptr; struct thread *td = curthread; struct uio auio; struct iovec aiov; int error; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); aiov.iov_base = buf; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_offset = (off_t)off; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_iovcnt = 1; auio.uio_resid = size; auio.uio_td = td; - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred); VOP_UNLOCK(vp, 0, td); return (error != 0 ? -1 : size - auio.uio_resid); } int kobj_read_file_loader(struct _buf *file, char *buf, unsigned size, unsigned off) { char *ptr; ptr = preload_search_info(file->ptr, MODINFO_ADDR); if (ptr == NULL) return (ENOENT); ptr = *(void **)ptr; bcopy(ptr + off, buf, size); return (0); } int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) { if (file->mounted) return (kobj_read_file_vnode(file, buf, size, off)); else return (kobj_read_file_loader(file, buf, size, off)); } void kobj_close_file(struct _buf *file) { if (file->mounted) { struct vnode *vp = file->ptr; struct thread *td = curthread; int flags = FREAD; vn_close(vp, flags, td->td_ucred, td); } kmem_free(file, sizeof(*file)); } Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c =================================================================== --- head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c (revision 175201) +++ head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c (revision 175202) @@ -1,280 +1,280 @@ /*- * Copyright (c) 2006-2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include MALLOC_DECLARE(M_MOUNT); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, int flags __unused) { struct vfsopt *opt; size_t namesize; if (vfsp->mnt_opt == NULL) { vfsp->mnt_opt = malloc(sizeof(*vfsp->mnt_opt), M_MOUNT, M_WAITOK); TAILQ_INIT(vfsp->mnt_opt); } opt = malloc(sizeof(*opt), M_MOUNT, M_WAITOK); namesize = strlen(name) + 1; opt->name = malloc(namesize, M_MOUNT, M_WAITOK); strlcpy(opt->name, name, namesize); if (arg == NULL) { opt->value = NULL; opt->len = 0; } else { opt->len = strlen(arg) + 1; opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(arg, opt->value, opt->len); } /* TODO: Locking. */ TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); } void vfs_clearmntopt(vfs_t *vfsp, const char *name) { struct vfsopt *opt; if (vfsp->mnt_opt == NULL) return; /* TODO: Locking. */ TAILQ_FOREACH(opt, vfsp->mnt_opt, link) { if (strcmp(opt->name, name) == 0) break; } if (opt != NULL) { TAILQ_REMOVE(vfsp->mnt_opt, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } } int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) { struct vfsoptlist *opts = vfsp->mnt_opt; int error; if (opts == NULL) return (0); error = vfs_getopt(opts, opt, (void **)argp, NULL); return (error != 0 ? 0 : 1); } int traverse(vnode_t **cvpp, int lktype) { kthread_t *td = curthread; vnode_t *cvp; vnode_t *tvp; vfs_t *vfsp; int error; cvp = *cvpp; tvp = NULL; /* * If this vnode is mounted on, then we transparently indirect * to the vnode which is the root of the mounted file system. * Before we do this we must check that an unmount is not in * progress on this vnode. */ for (;;) { /* * Reached the end of the mount chain? */ vfsp = vn_mountedvfs(cvp); if (vfsp == NULL) break; /* * tvp is NULL for *cvpp vnode, which we can't unlock. */ if (tvp != NULL) vput(cvp); else vrele(cvp); /* * The read lock must be held across the call to VFS_ROOT() to * prevent a concurrent unmount from destroying the vfs. */ error = VFS_ROOT(vfsp, lktype, &tvp, td); if (error != 0) return (error); cvp = tvp; } *cvpp = cvp; return (0); } int domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath, char *fspec, int fsflags) { struct mount *mp; struct vfsconf *vfsp; struct ucred *newcr, *oldcr; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (vp->v_type != VDIR) return (ENOTDIR); VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); mp = vfs_mount_alloc(vp, vfsp, fspath, td); VOP_UNLOCK(vp, 0, td); mp->mnt_optnew = NULL; vfs_setmntopt(mp, "from", fspec, 0); mp->mnt_optnew = mp->mnt_opt; mp->mnt_opt = NULL; /* * Set the mount level flags. * crdup() can sleep, so do it before acquiring a mutex. */ newcr = crdup(kcred); MNT_ILOCK(mp); if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS); /* * Unprivileged user can trigger mounting a snapshot, but we don't want * him to unmount it, so we switch to privileged credentials. */ oldcr = mp->mnt_cred; mp->mnt_cred = newcr; mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid; MNT_IUNLOCK(mp); crfree(oldcr); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; (void)VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Put the new filesystem on the mount list after root. */ #ifdef FREEBSD_NAMECACHE cache_purge(vp); #endif if (!error) { vnode_t *mvp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp, td)) panic("mount: lost mount"); mountcheckdirs(vp, mvp); vput(mvp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if (error) vrele(vp); else vfs_mountedfrom(mp, fspec); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0, td); vfs_unbusy(mp, td); vfs_mount_destroy(mp); } return (error); } Index: head/sys/cddl/compat/opensolaris/sys/vnode.h =================================================================== --- head/sys/cddl/compat/opensolaris/sys/vnode.h (revision 175201) +++ head/sys/cddl/compat/opensolaris/sys/vnode.h (revision 175202) @@ -1,268 +1,268 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_VNODE_H_ #define _OPENSOLARIS_SYS_VNODE_H_ #include_next #include #include #include #include #include #include #include typedef struct vnode vnode_t; typedef struct vattr vattr_t; typedef void caller_context_t; typedef struct vop_vector vnodeops_t; #define vop_fid vop_vptofh #define vop_fid_args vop_vptofh_args #define a_fid a_fhp #define v_count v_usecount static __inline int vn_is_readonly(vnode_t *vp) { return (vp->v_mount->mnt_flag & MNT_RDONLY); } #define vn_vfswlock(vp) (0) #define vn_vfsunlock(vp) do { } while (0) #define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) #define vn_mountedvfs(vp) ((vp)->v_mountedhere) #define vn_has_cached_data(vp) ((vp)->v_object != NULL && (vp)->v_object->resident_page_count > 0) #define VN_HOLD(v) vref(v) #define VN_RELE(v) vrele(v) #define VN_URELE(v) vput(v) #define VOP_REALVP(vp, vpp) (*(vpp) = (vp), 0) #define vnevent_remove(vp) do { } while (0) #define vnevent_rmdir(vp) do { } while (0) #define vnevent_rename_src(vp) do { } while (0) #define vnevent_rename_dest(vp) do { } while (0) #define IS_DEVVP(vp) \ ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) #define MODEMASK ALLPERMS #define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) #define MANDMODE(mode) (0) #define chklock(vp, op, offset, size, mode, ct) (0) #define cleanlocks(vp, pid, foo) do { } while (0) #define cleanshares(vp, pid) do { } while (0) /* * We will use va_spare is place of Solaris' va_mask. * This field is initialized in zfs_setattr(). */ #define va_mask va_spare /* TODO: va_fileid is shorter than va_nodeid !!! */ #define va_nodeid va_fileid /* TODO: This field needs conversion! */ #define va_nblocks va_bytes #define va_blksize va_blocksize #define va_seq va_gen #define MAXOFFSET_T OFF_MAX #define EXCL 0 #define AT_TYPE 0x0001 #define AT_MODE 0x0002 #define AT_UID 0x0004 #define AT_GID 0x0008 #define AT_FSID 0x0010 #define AT_NODEID 0x0020 #define AT_NLINK 0x0040 #define AT_SIZE 0x0080 #define AT_ATIME 0x0100 #define AT_MTIME 0x0200 #define AT_CTIME 0x0400 #define AT_RDEV 0x0800 #define AT_BLKSIZE 0x1000 #define AT_NBLOCKS 0x2000 #define AT_SEQ 0x4000 #define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\ AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) #define ACCESSED (AT_ATIME) #define STATE_CHANGED (AT_CTIME) #define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) static __inline void vattr_init_mask(vattr_t *vap) { vap->va_mask = 0; if (vap->va_type != VNON) vap->va_mask |= AT_TYPE; if (vap->va_uid != (uid_t)VNOVAL) vap->va_mask |= AT_UID; if (vap->va_gid != (gid_t)VNOVAL) vap->va_mask |= AT_GID; if (vap->va_size != (u_quad_t)VNOVAL) vap->va_mask |= AT_SIZE; if (vap->va_atime.tv_sec != VNOVAL) vap->va_mask |= AT_ATIME; if (vap->va_mtime.tv_sec != VNOVAL) vap->va_mask |= AT_MTIME; if (vap->va_mode != (u_short)VNOVAL) vap->va_mask |= AT_MODE; } #define FCREAT O_CREAT #define FTRUNC O_TRUNC #define FDSYNC FFSYNC #define FRSYNC FFSYNC #define FSYNC FFSYNC #define FOFFMAX 0x00 enum create { CRCREAT }; static __inline int zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, vnode_t **vpp, enum create crwhy, mode_t umask) { struct thread *td = curthread; struct nameidata nd; int error; ASSERT(seg == UIO_SYSSPACE); ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); ASSERT(crwhy == CRCREAT); ASSERT(umask == 0); if (td->td_proc->p_fd->fd_rdir == NULL) td->td_proc->p_fd->fd_rdir = rootvnode; if (td->td_proc->p_fd->fd_cdir == NULL) td->td_proc->p_fd->fd_cdir = rootvnode; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pnamep, td); error = vn_open_cred(&nd, &filemode, createmode, td->td_ucred, NULL); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) { /* We just unlock so we hold a reference. */ VN_HOLD(nd.ni_vp); VOP_UNLOCK(nd.ni_vp, 0, td); *vpp = nd.ni_vp; } return (error); } #define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \ zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask)) #define RLIM64_INFINITY 0 static __inline int zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr, ssize_t *residp) { struct thread *td = curthread; int error, vfslocked, resid; ASSERT(rw == UIO_WRITE); ASSERT(ioflag == 0); ASSERT(ulimit == RLIM64_INFINITY); ioflag = IO_APPEND | IO_UNIT; vfslocked = VFS_LOCK_GIANT(vp->v_mount); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED, &resid, td); VFS_UNLOCK_GIANT(vfslocked); if (residp != NULL) *residp = (ssize_t)resid; return (error); } #define vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \ zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp)) static __inline int zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr) { struct thread *td = curthread; struct mount *mp; int error, vfslocked; ASSERT(flag == FSYNC); vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); return (error); } #define VOP_FSYNC(vp, flag, cr) zfs_vop_fsync((vp), (flag), (cr)) static __inline int zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { ASSERT(flag == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); ASSERT(count == 1); ASSERT(offset == 0); return (vn_close(vp, flag, cr, curthread)); } #define VOP_CLOSE(vp, oflags, count, offset, cr) \ zfs_vop_close((vp), (oflags), (count), (offset), (cr)) static __inline int vn_rename(char *from, char *to, enum uio_seg seg) { ASSERT(seg == UIO_SYSSPACE); return (kern_rename(curthread, from, to, seg)); } enum rm { RMFILE }; static __inline int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) { ASSERT(seg == UIO_SYSSPACE); ASSERT(dirflag == RMFILE); return (kern_unlink(curthread, fnamep, seg)); } #endif /* _OPENSOLARIS_SYS_VNODE_H_ */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c (revision 175201) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c (revision 175202) @@ -1,884 +1,884 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Portions Copyright 2007 Shivakumar GN */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Generic pseudo-filesystem routines. * * There are significant similarities between the implementation of certain file * system entry points across different filesystems. While one could attempt to * "choke up on the bat" and incorporate common functionality into a VOP * preamble or postamble, such an approach is limited in the benefit it can * provide. In this file we instead define a toolkit of routines which can be * called from a filesystem (with in-kernel pseudo-filesystems being the focus * of the exercise) in a more component-like fashion. * * There are three basic classes of routines: * * 1) Lowlevel support routines * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, * without enforcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() * gfs_readdir_emit() * gfs_readdir_emitn() * gfs_readdir_pred() * gfs_readdir_fini() * gfs_lookup_dot() * * 2) Complete GFS management * * These routines take a more active role in management of the * pseudo-filesystem. They handle the relationship between vnode private * data and VFS data, as well as the relationship between vnodes in the * directory hierarchy. * * In order to use these interfaces, the first member of every private * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control * to GFS. * * gfs_file_create() * gfs_dir_create() * gfs_root_create() * * gfs_file_inactive() * gfs_dir_inactive() * gfs_dir_lookup() * gfs_dir_readdir() * * gfs_vop_inactive() * gfs_vop_lookup() * gfs_vop_readdir() * gfs_vop_map() * * 3) Single File pseudo-filesystems * * This routine creates a rooted file to be overlayed ontop of another * file in the physical filespace. * * Note that the parent is NULL (actually the vfs), but there is nothing * technically keeping such a file from utilizing the "Complete GFS * management" set of routines. * * gfs_root_create_file() */ /* * Low level directory routines * * These routines provide some simple abstractions for reading directories. * They are designed to be used by existing pseudo filesystems (namely procfs) * that already have a complicated management infrastructure. */ /* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length * ureclen - the exported file-space record length (1 for non-legacy FSs) * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode * * Returns 0 or a non-zero errno. * * Typical VOP_READDIR usage of gfs_readdir_*: * * if ((error = gfs_readdir_init(...)) != 0) * return (error); * eof = 0; * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { * if (!consumer_entry_at(voffset)) * voffset = consumer_next_entry(voffset); * if (consumer_eof(voffset)) { * eof = 1 * break; * } * if ((error = gfs_readdir_emit(..., voffset, * consumer_ino(voffset), consumer_name(voffset))) != 0) * break; * } * return (gfs_readdir_fini(..., error, eofp, eof)); * * As you can see, a zero result from gfs_readdir_pred() or * gfs_readdir_emit() indicates that processing should continue, * whereas a non-zero result indicates that the loop should terminate. * Most consumers need do nothing more than let gfs_readdir_fini() * determine what the cause of failure was and return the appropriate * value. */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, uio_t *uiop, ino64_t parent, ino64_t self) { if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); st->grd_parent = parent; st->grd_self = self; return (0); } /* * gfs_readdir_emit_int: internal routine to emit directory entry * * st - the current readdir state, which must have d_ino and d_name * set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ static int gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, int *ncookies, u_long **cookies) { int reclen, namlen; namlen = strlen(st->grd_dirent->d_name); reclen = DIRENT64_RECLEN(namlen); if (reclen > uiop->uio_resid) { /* * Error if no entries were returned yet */ if (uiop->uio_resid == st->grd_oresid) return (EINVAL); return (-1); } /* XXX: This can change in the future. */ st->grd_dirent->d_type = DT_DIR; st->grd_dirent->d_reclen = (ushort_t)reclen; st->grd_dirent->d_namlen = namlen; if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); uiop->uio_loffset = next; if (*cookies != NULL) { **cookies = next; (*cookies)++; (*ncookies)--; KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies)); } return (0); } /* * gfs_readdir_emit: emit a directory entry * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, const char *name, int *ncookies, u_long **cookies) { offset_t off = (voff + 2) * st->grd_ureclen; st->grd_dirent->d_ino = ino; (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); /* * Inter-entry offsets are invalid, so we assume a record size of * grd_ureclen and explicitly set the offset appropriately. */ return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies, cookies)); } /* * gfs_readdir_pred: readdir loop predicate * voffp - a pointer in which the next virtual offset should be stored * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp, int *ncookies, u_long **cookies) { offset_t off, voff; int error; top: if (uiop->uio_resid <= 0) return (-1); off = uiop->uio_loffset / st->grd_ureclen; voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, ".", ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, "..", ncookies, cookies)) == 0) goto top; } else { *voffp = voff; return (0); } return (error); } /* * gfs_readdir_fini: generic readdir cleanup * error - if positive, an error to return * eofp - the eofp passed to readdir * eof - the eof value * * Returns a 0 on success, a non-zero errno on failure. This result * should be returned from readdir. */ int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); if (error > 0) return (error); if (eofp) *eofp = eof; return (0); } /* * gfs_lookup_dot * * Performs a basic check for "." and ".." directory entries. */ int gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) { if (*nm == '\0' || strcmp(nm, ".") == 0) { VN_HOLD(dvp); *vpp = dvp; return (0); } else if (strcmp(nm, "..") == 0) { if (pvp == NULL) { ASSERT(dvp->v_flag & VROOT); VN_HOLD(dvp); *vpp = dvp; } else { VN_HOLD(pvp); *vpp = pvp; } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (0); } return (-1); } /* * gfs_file_create(): create a new GFS file * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * * In order to use this interface, the parent vnode must have been created by * gfs_dir_create(), and the private data stored in v_data must have a * 'gfs_file_t' as its first field. * * Given these constraints, this routine will automatically: * * - Allocate v_data for the vnode * - Initialize necessary fields in the vnode * - Hold the parent */ vnode_t * gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops) { gfs_file_t *fp; vnode_t *vp; int error; /* * Allocate vnode and internal data structure */ fp = kmem_zalloc(size, KM_SLEEP); error = getnewvnode("zfs", vfsp, ops, &vp); ASSERT(error == 0); vp->v_data = (caddr_t)fp; /* * Set up various pointers */ fp->gfs_vnode = vp; fp->gfs_parent = pvp; fp->gfs_size = size; fp->gfs_type = GFS_FILE; error = insmntque(vp, vfsp); KASSERT(error == 0, ("insmntque() failed: error %d", error)); /* * Initialize vnode and hold parent. */ if (pvp) VN_HOLD(pvp); return (vp); } /* * gfs_dir_create: creates a new directory in the parent * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * entries - NULL-terminated list of static entries (if any) * maxlen - maximum length of a directory entry * readdir_cb - readdir callback (see gfs_dir_readdir) * inode_cb - inode callback (see gfs_dir_readdir) * lookup_cb - lookup callback (see gfs_dir_lookup) * * In order to use this function, the first member of the private vnode * structure (v_data) must be a gfs_dir_t. For each directory, there are * static entries, defined when the structure is initialized, and dynamic * entries, retrieved through callbacks. * * If a directory has static entries, then it must supply a inode callback, * which will compute the inode number based on the parent and the index. * For a directory with dynamic entries, the caller must supply a readdir * callback and a lookup callback. If a static lookup fails, we fall back to * the supplied lookup callback, if any. * * This function also performs the same initialization as gfs_file_create(). */ vnode_t * gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; gfs_dir_t *dp; gfs_dirent_t *de; vp = gfs_file_create(struct_size, pvp, vfsp, ops); vp->v_type = VDIR; dp = vp->v_data; dp->gfsd_file.gfs_type = GFS_DIR; dp->gfsd_maxlen = maxlen; if (entries != NULL) { for (de = entries; de->gfse_name != NULL; de++) dp->gfsd_nstatic++; dp->gfsd_static = kmem_alloc( dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); bcopy(entries, dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } dp->gfsd_readdir = readdir_cb; dp->gfsd_lookup = lookup_cb; dp->gfsd_inode = inode_cb; mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); return (vp); } /* * gfs_root_create(): create a root vnode for a GFS filesystem * * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The * only difference is that it takes a vfs_t instead of a vnode_t as its parent. */ vnode_t * gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; VFS_HOLD(vfsp); vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb, maxlen, readdir_cb, lookup_cb); /* Manually set the inode */ ((gfs_file_t *)vp->v_data)->gfs_ino = ino; vp->v_flag |= VROOT; return (vp); } /* * gfs_file_inactive() * * Called from the VOP_INACTIVE() routine. If necessary, this routine will * remove the given vnode from the parent directory and clean up any references * in the VFS layer. * * If the vnode was not removed (due to a race with vget), then NULL is * returned. Otherwise, a pointer to the private data is returned. */ void * gfs_file_inactive(vnode_t *vp) { int i; gfs_dirent_t *ge = NULL; gfs_file_t *fp = vp->v_data; gfs_dir_t *dp = NULL; void *data; if (fp->gfs_parent == NULL) goto found; dp = fp->gfs_parent->v_data; /* * First, see if this vnode is cached in the parent. */ gfs_dir_lock(dp); /* * Find it in the set of static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (ge->gfse_vnode == vp) goto found; } /* * If 'ge' is NULL, then it is a dynamic entry. */ ge = NULL; found: VI_LOCK(vp); ASSERT(vp->v_count < 2); /* * Really remove this vnode */ data = vp->v_data; if (ge != NULL) { /* * If this was a statically cached entry, simply set the * cached vnode to NULL. */ ge->gfse_vnode = NULL; } if (vp->v_count == 1) { vp->v_usecount--; vdropl(vp); } else { VI_UNLOCK(vp); } /* * Free vnode and release parent */ if (fp->gfs_parent) { gfs_dir_unlock(dp); VI_LOCK(fp->gfs_parent); fp->gfs_parent->v_usecount--; VI_UNLOCK(fp->gfs_parent); } else { ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } return (data); } /* * gfs_dir_inactive() * * Same as above, but for directories. */ void * gfs_dir_inactive(vnode_t *vp) { gfs_dir_t *dp; ASSERT(vp->v_type == VDIR); if ((dp = gfs_file_inactive(vp)) != NULL) { mutex_destroy(&dp->gfsd_lock); if (dp->gfsd_nstatic) kmem_free(dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } return (dp); } /* * gfs_dir_lookup() * * Looks up the given name in the directory and returns the corresponding vnode, * if found. * * First, we search statically defined entries, if any. If a match is found, * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the * existing vnode. Otherwise, we call the static entry's callback routine, * caching the result if necessary. * * If no static entry is found, we invoke the lookup callback, if any. The * arguments to this callback are: * * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode * * Returns 0 on success, non-zero on error. */ int gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) { int i; gfs_dirent_t *ge; vnode_t *vp; gfs_dir_t *dp = dvp->v_data; int ret = 0; ASSERT(dvp->v_type == VDIR); if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) return (0); gfs_dir_lock(dp); /* * Search static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (strcmp(ge->gfse_name, nm) == 0) { if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); goto out; } /* * We drop the directory lock, as the constructor will * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set * for this entry, we discard the result in favor of the * cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); gfs_dir_lock(dp); ((gfs_file_t *)vp->v_data)->gfs_index = i; /* Set the inode according to the callback. */ ((gfs_file_t *)vp->v_data)->gfs_ino = dp->gfsd_inode(dvp, i); if (ge->gfse_flags & GFS_CACHE_VNODE) { if (ge->gfse_vnode == NULL) { ge->gfse_vnode = vp; } else { /* * A parallel constructor beat us to it; * return existing vnode. We have to be * careful because we can't release the * current vnode while holding the * directory lock; its inactive routine * will try to lock this directory. */ vnode_t *oldvp = vp; vp = ge->gfse_vnode; VN_HOLD(vp); gfs_dir_unlock(dp); VN_RELE(oldvp); gfs_dir_lock(dp); } } goto out; } } /* * See if there is a dynamic constructor. */ if (dp->gfsd_lookup) { ino64_t ino; gfs_file_t *fp; /* * Once again, drop the directory lock, as the lookup routine * will need to allocate memory, or otherwise deadlock on this * directory. */ gfs_dir_unlock(dp); ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); gfs_dir_lock(dp); if (ret != 0) goto out; fp = (gfs_file_t *)vp->v_data; fp->gfs_index = -1; fp->gfs_ino = ino; } else { /* * No static entry found, and there is no lookup callback, so * return ENOENT. */ ret = ENOENT; } out: gfs_dir_unlock(dp); if (ret == 0) *vpp = vp; else *vpp = NULL; return (ret); } /* * gfs_dir_readdir: does a readdir() on the given directory * * dvp - directory vnode * uiop - uio structure * eofp - eof pointer * data - arbitrary data passed to readdir callback * * This routine does all the readdir() dirty work. Even so, the caller must * supply two callbacks in order to get full compatibility. * * If the directory contains static entries, an inode callback must be * specified. This avoids having to create every vnode and call VOP_GETATTR() * when reading the directory. This function has the following arguments: * * ino_t gfs_inode_cb(vnode_t *vp, int index); * * vp - vnode for the directory * index - index in original gfs_dirent_t array * * Returns the inode number for the given entry. * * For directories with dynamic entries, a readdir callback must be provided. * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, * offset_t *off, offset_t *nextoff, void *data) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and * d_ino. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left * untouched. * nextoff - callback must set to offset of next entry. Typically * (off + 1) * data - caller-supplied data * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, u_long **cookies, void *data) { gfs_readdir_state_t gstate; int error, eof = 0; ino64_t ino, pino; offset_t off, next; gfs_dir_t *dp = dvp->v_data; ino = dp->gfsd_file.gfs_ino; if (dp->gfsd_file.gfs_parent == NULL) pino = ino; /* root of filesystem */ else pino = ((gfs_file_t *) (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, pino, ino)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, cookies)) == 0 && !eof) { if (off >= 0 && off < dp->gfsd_nstatic) { ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, off, ino, dp->gfsd_static[off].gfse_name, ncookies, cookies)) != 0) break; } else if (dp->gfsd_readdir) { off -= dp->gfsd_nstatic; if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, data)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; next += dp->gfsd_nstatic + 2; if ((error = gfs_readdir_emit_int(&gstate, uiop, next, ncookies, cookies)) != 0) break; } else { /* * Offset is beyond the end of the static entries, and * we have no dynamic entries. Set EOF. */ eof = 1; } } return (gfs_readdir_fini(&gstate, error, eofp, eof)); } /* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. Given a GFS directory, calls * gfs_dir_readdir() as necessary. */ /* ARGSUSED */ int gfs_vop_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; } */ *ap; { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; int *eofp = ap->a_eofflag; int ncookies = 0; u_long *cookies = NULL; int error; if (ap->a_ncookies) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); if (error == 0) { /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; } else if (ap->a_ncookies) { free(*ap->a_cookies, M_TEMP); *ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } /* * gfs_vop_inactive: VOP_INACTIVE() entry point * * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. */ /* ARGSUSED */ int gfs_vop_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; gfs_file_t *fp = vp->v_data; void *data; if (fp->gfs_type == GFS_DIR) data = gfs_dir_inactive(vp); else data = gfs_file_inactive(vp); if (data != NULL) kmem_free(data, fp->gfs_size); vp->v_data = NULL; return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c (revision 175201) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c (revision 175202) @@ -1,1119 +1,1119 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' directory, but this may expand in the * future. The elements are built using the GFS primitives, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding vnode. * * All mounts are handled automatically by the kernel, but unmounts are * (currently) handled from user land. The main reason is that there is no * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. */ #include #include #include #include #include #include #include #include #include typedef struct { char *se_name; vnode_t *se_root; avl_node_t se_node; } zfs_snapentry_t; static int snapentry_compare(const void *a, const void *b) { const zfs_snapentry_t *sa = a; const zfs_snapentry_t *sb = b; int ret = strcmp(sa->se_name, sb->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } static struct vop_vector zfsctl_ops_root; static struct vop_vector zfsctl_ops_snapdir; static struct vop_vector zfsctl_ops_snapshot; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); typedef struct zfsctl_node { gfs_dir_t zc_gfs_private; uint64_t zc_id; timestruc_t zc_cmtime; /* ctime and mtime, always the same */ } zfsctl_node_t; typedef struct zfsctl_snapdir { zfsctl_node_t sd_node; kmutex_t sd_lock; avl_tree_t sd_snaps; } zfsctl_snapdir_t; /* * Root directory elements. We have only a single static entry, 'snapshot'. */ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, { NULL } }; /* include . and .. in the calculation */ #define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ sizeof (gfs_dirent_t)) + 1) /* * Initialize the various GFS pieces we'll need to create and manipulate .zfs * directories. This is called from the ZFS init routine, and initializes the * vnode ops vectors that we'll be using. */ void zfsctl_init(void) { } void zfsctl_fini(void) { } /* * Return the inode number associated with the 'snapshot' directory. */ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { ASSERT(index == 0); return (ZFSCTL_INO_SNAPDIR); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the vfs_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. */ void zfsctl_create(zfsvfs_t *zfsvfs) { vnode_t *vp, *rvp; zfsctl_node_t *zcp; ASSERT(zfsvfs->z_ctldir == NULL); vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = ZFSCTL_INO_ROOT; VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0); ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); VN_URELE(rvp); /* * We're only faking the fact that we have a root of a filesystem for * the sake of the GFS interfaces. Undo the flag manipulation it did * for us. */ vp->v_vflag &= ~VV_ROOT; zfsvfs->z_ctldir = vp; } /* * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. * There might still be more references if we were force unmounted, but only * new zfs_inactive() calls can occur and they don't reference .zfs */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { VN_RELE(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ vnode_t * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); VN_HOLD(zp->z_zfsvfs->z_ctldir); return (zp->z_zfsvfs->z_ctldir); } /* * Common open routine. Disallow any write access. */ /* ARGSUSED */ static int zfsctl_common_open(struct vop_open_args *ap) { int flags = ap->a_mode; if (flags & FWRITE) return (EACCES); return (0); } /* * Common close routine. Nothing to do here. */ /* ARGSUSED */ static int zfsctl_common_close(struct vop_close_args *ap) { return (0); } /* * Common access routine. Disallow writes. */ /* ARGSUSED */ static int zfsctl_common_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { int mode = ap->a_mode; if (mode & VWRITE) return (EACCES); return (0); } /* * Common getattr function. Fill in basic information. */ static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { zfsctl_node_t *zcp = vp->v_data; timestruc_t now; vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = 0; /* * We are a purly virtual object, so we have no * blocksize or allocated blocks. */ vap->va_blksize = 0; vap->va_nblocks = 0; vap->va_seq = 0; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; vap->va_type = VDIR; /* * We live in the now (for atime). */ gethrestime(&now); vap->va_atime = now; vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime; /* FreeBSD: Reset chflags(2) flags. */ vap->va_flags = 0; } static int zfsctl_common_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { vnode_t *vp = ap->a_vp; fid_t *fidp = (void *)ap->a_fid; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_node_t *zcp = vp->v_data; uint64_t object = zcp->zc_id; zfid_short_t *zfid; int i; ZFS_ENTER(zfsvfs); fidp->fid_len = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs znodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; ZFS_EXIT(zfsvfs); return (0); } static int zfsctl_common_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); VI_LOCK(vp); vp->v_data = NULL; VI_UNLOCK(vp); return (0); } /* * .zfs inode namespace * * We need to generate unique inode numbers for all files and directories * within the .zfs pseudo-filesystem. We use the following scheme: * * ENTRY ZFSCTL_INODE * .zfs 1 * .zfs/snapshot 2 * .zfs/snapshot/ objectid(snap) */ #define ZFSCTL_INO_SNAP(id) (id) /* * Get root directory attributes. */ /* ARGSUSED */ static int zfsctl_root_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); return (0); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread); if (err == 0) VOP_UNLOCK(*vpp, 0, curthread); } else { err = gfs_dir_lookup(dvp, nm, vpp); } ZFS_EXIT(zfsvfs); return (err); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup_vop(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; cred_t *cr = ap->a_cnp->cn_cred; int flags = ap->a_cnp->cn_flags; int nameiop = ap->a_cnp->cn_nameiop; char nm[NAME_MAX + 1]; int err; if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) return (EOPNOTSUPP); ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr); if (err == 0 && (nm[0] != '.' || nm[1] != '\0')) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (err); } static struct vop_vector zfsctl_ops_root = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_root_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_root_lookup_vop, .vop_inactive = gfs_vop_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, }; static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (ENAMETOOLONG); (void) strcat(zname, "@"); (void) strcat(zname, name); return (0); } static int zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) { zfsctl_snapdir_t *sdp = dvp->v_data; zfs_snapentry_t search, *sep; struct vop_inactive_args ap; avl_index_t where; int err; ASSERT(MUTEX_HELD(&sdp->sd_lock)); search.se_name = (char *)name; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) return (ENOENT); ASSERT(vn_ismntpt(sep->se_root)); /* this will be dropped by dounmount() */ if ((err = vn_vfswlock(sep->se_root)) != 0) return (err); err = dounmount(vn_mountedvfs(sep->se_root), force, curthread); if (err) return (err); ASSERT(sep->se_root->v_count == 1); ap.a_vp = sep->se_root; gfs_vop_inactive(&ap); avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); return (0); } #if 0 static void zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) { avl_index_t where; vfs_t *vfsp; refstr_t *pathref; char newpath[MAXNAMELEN]; char *tail; ASSERT(MUTEX_HELD(&sdp->sd_lock)); ASSERT(sep != NULL); vfsp = vn_mountedvfs(sep->se_root); ASSERT(vfsp != NULL); vfs_lock_wait(vfsp); /* * Change the name in the AVL tree. */ avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); avl_insert(&sdp->sd_snaps, sep, where); /* * Change the current mountpoint info: * - update the tail of the mntpoint path * - update the tail of the resource path */ pathref = vfs_getmntpoint(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '/')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setmntpoint(vfsp, newpath); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '@')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setresource(vfsp, newpath); vfs_unlock(vfsp); } #endif #if 0 static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; avl_index_t where; char from[MAXNAMELEN], to[MAXNAMELEN]; int err; err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); if (err) return (err); err = zfs_secpolicy_write(from, cr); if (err) return (err); /* * Cannot move snapshots out of the snapdir. */ if (sdvp != tdvp) return (EINVAL); if (strcmp(snm, tnm) == 0) return (0); err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); if (err) return (err); mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { mutex_exit(&sdp->sd_lock); return (ENOENT); } err = dmu_objset_rename(from, to, B_FALSE); if (err == 0) zfsctl_rename_snap(sdp, sep, tnm); mutex_exit(&sdp->sd_lock); return (err); } #endif #if 0 /* ARGSUSED */ static int zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) { zfsctl_snapdir_t *sdp = dvp->v_data; char snapname[MAXNAMELEN]; int err; err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); if (err) return (err); err = zfs_secpolicy_write(snapname, cr); if (err) return (err); mutex_enter(&sdp->sd_lock); err = zfsctl_unmount_snap(dvp, name, 0, cr); if (err) { mutex_exit(&sdp->sd_lock); return (err); } err = dmu_objset_destroy(snapname); mutex_exit(&sdp->sd_lock); return (err); } #endif /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. */ /* ARGSUSED */ int zfsctl_snapdir_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; char nm[NAME_MAX + 1]; zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[MAXNAMELEN]; char *mountpoint; zfs_snapentry_t *sep, search; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); ASSERT(dvp->v_type == VDIR); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) return (0); *vpp = NULL; /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the * spec (which looks like a local path for zfs). We need to * add some flag to domount() to tell it not to do this lookup. */ if (MUTEX_HELD(&sdp->sd_lock)) return (ENOENT); ZFS_ENTER(zfsvfs); mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { *vpp = sep->se_root; VN_HOLD(*vpp); if ((*vpp)->v_mountedhere == NULL) { /* * The snapshot was unmounted behind our backs, * try to remount it. */ goto domount; } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (0); } /* * The requested snapshot is not currently mounted, look it up. */ err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); if (err) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (err); } if (dmu_objset_open(snapname, DMU_OST_ZFS, DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (ENOENT); } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); VN_HOLD(*vpp); avl_insert(&sdp->sd_snaps, sep, where); dmu_objset_close(snap); domount: mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + strlen("/.zfs/snapshot/") + strlen(nm) + 1; mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", dvp->v_vfsp->mnt_stat.f_mntonname, nm); err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0); kmem_free(mountpoint, mountpoint_len); /* FreeBSD: This line was moved from below to avoid a lock recursion. */ if (err == 0) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. */ if (err) { VN_RELE(*vpp); *vpp = NULL; } return (err); } /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[MAXNAMELEN]; uint64_t id, cookie; ZFS_ENTER(zfsvfs); cookie = *offp; if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &cookie) == ENOENT) { *eofp = 1; ZFS_EXIT(zfsvfs); return (0); } (void) strcpy(dp->d_name, snapname); dp->d_ino = ZFSCTL_INO_SNAP(id); *nextp = cookie; ZFS_EXIT(zfsvfs); return (0); } vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) { vnode_t *vp; zfsctl_snapdir_t *sdp; vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, zfsctl_snapdir_readdir_cb, NULL); sdp = vp->v_data; sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&sdp->sd_snaps, snapentry_compare, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); return (vp); } /* ARGSUSED */ static int zfsctl_snapdir_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_snapdir_t *sdp = vp->v_data; ZFS_ENTER(zfsvfs); zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfsctl_snapdir_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfsctl_snapdir_t *sdp = vp->v_data; void *private; private = gfs_dir_inactive(vp); if (private != NULL) { ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); mutex_destroy(&sdp->sd_lock); avl_destroy(&sdp->sd_snaps); kmem_free(private, sizeof (zfsctl_snapdir_t)); } return (0); } static struct vop_vector zfsctl_ops_snapdir = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_snapdir_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_snapdir_lookup, .vop_inactive = zfsctl_snapdir_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, }; static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { vnode_t *vp; zfsctl_node_t *zcp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; return (vp); } static int zfsctl_snapshot_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; struct vop_inactive_args iap; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int locked; vnode_t *dvp; VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); sdp = dvp->v_data; VOP_UNLOCK(dvp, 0, ap->a_td); if (!(locked = MUTEX_HELD(&sdp->sd_lock))) mutex_enter(&sdp->sd_lock); if (vp->v_count > 1) { if (!locked) mutex_exit(&sdp->sd_lock); return (0); } ASSERT(!vn_ismntpt(vp)); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); if (sep->se_root == vp) { avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); break; } sep = next; } ASSERT(sep != NULL); if (!locked) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); /* * Dispose of the vnode for the snapshot mount point. * This is safe to do because once this entry has been removed * from the AVL tree, it can't be found again, so cannot become * "active". If we lookup the same name again we will end up * creating a new vnode. */ iap.a_vp = vp; return (gfs_vop_inactive(&iap)); } static int zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td) { VN_HOLD(*vpp); /* Snapshot should be already mounted, but just in case. */ if (vn_mountedvfs(*vpp) == NULL) return (ENOENT); return (traverse(vpp, lktype)); } static void zfsctl_traverse_end(vnode_t *vp, int err) { if (err == 0) vput(vp); else VN_RELE(vp); } static int zfsctl_snapshot_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; int err; err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td); if (err == 0) err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); zfsctl_traverse_end(vp, err); return (err); } static int zfsctl_snapshot_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { vnode_t *vp = ap->a_vp; int err; err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread); if (err == 0) err = VOP_VPTOFH(vp, (void *)ap->a_fid); zfsctl_traverse_end(vp, err); return (err); } /* * These VP's should never see the light of day. They should always * be covered. */ static struct vop_vector zfsctl_ops_snapshot = { .vop_default = &default_vnodeops, .vop_inactive = zfsctl_snapshot_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_getattr = zfsctl_snapshot_getattr, .vop_fid = zfsctl_snapshot_fid, }; int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *vp; zfsctl_snapdir_t *sdp; zfsctl_node_t *zcp; zfs_snapentry_t *sep; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, kcred); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { vp = sep->se_root; zcp = vp->v_data; if (zcp->zc_id == objsetid) break; sep = AVL_NEXT(&sdp->sd_snaps, sep); } if (sep != NULL) { VN_HOLD(vp); error = traverse(&vp, LK_SHARED | LK_RETRY); if (error == 0) { if (vp == sep->se_root) error = EINVAL; else *zfsvfsp = VTOZ(vp)->z_zfsvfs; } mutex_exit(&sdp->sd_lock); if (error == 0) VN_URELE(vp); else VN_RELE(vp); } else { error = EINVAL; mutex_exit(&sdp->sd_lock); } VN_RELE(dvp); return (error); } /* * Unmount any snapshots for the given filesystem. This is called from * zfs_umount() - if we have a ctldir, then go through and unmount all the * snapshots. */ int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { struct vop_inactive_args ap; zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *svp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, cr); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { svp = sep->se_root; next = AVL_NEXT(&sdp->sd_snaps, sep); /* * If this snapshot is not mounted, then it must * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ if (vn_ismntpt(svp)) { if ((error = vn_vfswlock(svp)) != 0) goto out; /* * Increase usecount, so dounmount() won't vrele() it * to 0 and call zfsctl_snapdir_inactive(). */ VN_HOLD(svp); vfsp = vn_mountedvfs(svp); mtx_lock(&Giant); error = dounmount(vfsp, fflags, curthread); mtx_unlock(&Giant); if (error != 0) { VN_RELE(svp); goto out; } avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); /* * We can't use VN_RELE(), as that will try to * invoke zfsctl_snapdir_inactive(), and that * would lead to an attempt to re-grab the sd_lock. */ ASSERT3U(svp->v_count, ==, 1); ap.a_vp = svp; gfs_vop_inactive(&ap); } sep = next; } out: mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return (error); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c (revision 175201) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c (revision 175202) @@ -1,430 +1,430 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Functions to replay ZFS intent log (ZIL) records * The functions are called through a function vector (zfs_replay_vector) * which is indexed by the transaction type. */ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { VATTR_NULL(vap); vap->va_mask = (uint_t)mask; vap->va_type = IFTOVT(mode); vap->va_mode = mode & MODEMASK; vap->va_uid = (uid_t)uid; vap->va_gid = (gid_t)gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } /* ARGSUSED */ static int zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) { return (ENOTSUP); } static int zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) { char *name = (char *)(lr + 1); /* name follows lr_create_t */ char *link; /* symlink content follows name */ znode_t *dzp; vnode_t *vp = NULL; vattr_t va; struct componentname cn; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time and generation number. The generic VOP_CREATE() * doesn't have either concept, so we smuggle the values inside * the vattr's otherwise unused va_ctime and va_nblocks fields. */ ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); va.va_nblocks = lr->lr_gen; cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch ((int)lr->lr_common.lrc_txtype) { case TX_CREATE: error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va); break; case TX_MKDIR: error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va); break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &va, &vp, kcred); break; case TX_SYMLINK: link = name + strlen(name) + 1; error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link); break; default: error = ENOTSUP; } VOP_UNLOCK(ZTOV(dzp), 0, curthread); if (error == 0 && vp != NULL) { VOP_UNLOCK(vp, 0, curthread); VN_RELE(vp); } VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) { char *name = (char *)(lr + 1); /* name follows lr_remove_t */ znode_t *dzp; struct componentname cn; vnode_t *vp; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); bzero(&cn, sizeof(cn)); cn.cn_nameptr = name; cn.cn_namelen = strlen(name); cn.cn_nameiop = DELETE; cn.cn_flags = ISLASTCN | SAVENAME; cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cn.cn_cred = kcred; cn.cn_thread = curthread; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); if (error != 0) { VOP_UNLOCK(ZTOV(dzp), 0, curthread); goto fail; } switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: error = VOP_REMOVE(ZTOV(dzp), vp, &cn); break; case TX_RMDIR: error = VOP_RMDIR(ZTOV(dzp), vp, &cn); break; default: error = ENOTSUP; } vput(vp); VOP_UNLOCK(ZTOV(dzp), 0, curthread); fail: VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) { char *name = (char *)(lr + 1); /* name follows lr_link_t */ znode_t *dzp, *zp; struct componentname cn; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { VN_RELE(ZTOV(dzp)); return (error); } cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn); VOP_UNLOCK(ZTOV(zp), 0, curthread); VOP_UNLOCK(ZTOV(dzp), 0, curthread); VN_RELE(ZTOV(zp)); VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) { char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; struct componentname scn, tcn; vnode_t *svp, *tvp; kthread_t *td = curthread; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { VN_RELE(ZTOV(sdzp)); return (error); } svp = tvp = NULL; bzero(&scn, sizeof(scn)); scn.cn_nameptr = sname; scn.cn_namelen = strlen(sname); scn.cn_nameiop = DELETE; scn.cn_flags = ISLASTCN | SAVENAME; scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; scn.cn_cred = kcred; scn.cn_thread = td; - vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); VOP_UNLOCK(ZTOV(sdzp), 0, td); if (error != 0) goto fail; VOP_UNLOCK(svp, 0, td); bzero(&tcn, sizeof(tcn)); tcn.cn_nameptr = tname; tcn.cn_namelen = strlen(tname); tcn.cn_nameiop = RENAME; tcn.cn_flags = ISLASTCN | SAVENAME; tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; tcn.cn_cred = kcred; tcn.cn_thread = td; - vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK(ZTOV(tdzp), 0, td); goto fail; } error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn); return (error); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); VN_RELE(ZTOV(tdzp)); VN_RELE(ZTOV(sdzp)); return (error); } static int zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) { char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; int error; ssize_t resid; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log writes out of order, it's possible the * file has been removed. In this case just drop the write * and return success. */ if (error == ENOENT) error = 0; return (error); } error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); VN_RELE(ZTOV(zp)); return (error); } static int zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) { ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); return (EOPNOTSUPP); } static int zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) { znode_t *zp; vattr_t va; vnode_t *vp; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log setattrs out of order, it's possible the * file has been removed. In this case just drop the setattr * and return success. */ if (error == ENOENT) error = 0; return (error); } zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); va.va_size = lr->lr_size; ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, &va, kcred, curthread); VOP_UNLOCK(vp, 0, curthread); VN_RELE(vp); return (error); } static int zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) { ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ #ifdef TODO vsecattr_t vsa; #endif znode_t *zp; int error; if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_aclcnt); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log acls out of order, it's possible the * file has been removed. In this case just drop the acl * and return success. */ if (error == ENOENT) error = 0; return (error); } #ifdef TODO bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); #else error = EOPNOTSUPP; #endif VN_RELE(ZTOV(zp)); return (error); } /* * Callback vectors for replaying records */ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* 0 no such transaction type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ zfs_replay_create, /* TX_MKXATTR */ zfs_replay_create, /* TX_SYMLINK */ zfs_replay_remove, /* TX_REMOVE */ zfs_replay_remove, /* TX_RMDIR */ zfs_replay_link, /* TX_LINK */ zfs_replay_rename, /* TX_RENAME */ zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ zfs_replay_acl, /* TX_ACL */ }; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 175201) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 175202) @@ -1,1021 +1,1021 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_debug_level = 0; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); static int zfs_mount(vfs_t *vfsp, kthread_t *td); static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); static struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_fhtovp = zfs_fhtovp, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; int error; error = vfs_stdsync(vfsp, waitfor, td); if (error != 0) return (error); ZFS_ENTER(zfsvfs); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, UINT64_MAX, 0); else txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval < SPA_MINBLOCKSIZE || newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) newval = SPA_MAXBLOCKSIZE; zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->vfs_bsize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_refresh_properties(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Remount operations default to "rw" unless "ro" is explicitly * specified. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { readonly_changed_cb(zfsvfs, B_TRUE); } else { if (!dmu_objset_is_snapshot(zfsvfs->z_os)) readonly_changed_cb(zfsvfs, B_FALSE); else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) return (EROFS); } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid_changed_cb(zfsvfs, B_FALSE); } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) setuid_changed_cb(zfsvfs, B_FALSE); else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) setuid_changed_cb(zfsvfs, B_TRUE); } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) exec_changed_cb(zfsvfs, B_FALSE); else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) exec_changed_cb(zfsvfs, B_TRUE); if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) atime_changed_cb(zfsvfs, B_TRUE); else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) atime_changed_cb(zfsvfs, B_FALSE); if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) xattr_changed_cb(zfsvfs, B_TRUE); else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) xattr_changed_cb(zfsvfs, B_FALSE); return (0); } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; int readonly, do_readonly = FALSE; int setuid, do_setuid = FALSE; int exec, do_exec = FALSE; int xattr, do_xattr = FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "xattr", xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "recordsize", blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "readonly", readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "setuid", setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "exec", exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclmode", acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); return (0); unregister: /* * We may attempt to unregister some callbacks that are not * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. */ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); return (error); } static int zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) { cred_t *cr = td->td_ucred; uint64_t recordsize, readonly; int error = 0; int mode; zfsvfs_t *zfsvfs; znode_t *zp = NULL; ASSERT(vfsp); ASSERT(osname); /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, * and just bzero up to z_hold_mtx[]. */ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = vfsp; zfsvfs->z_parent = zfsvfs; zfsvfs->z_assign = TXG_NOWAIT; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_MPSAFE; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) goto out; if (readonly) mode = DS_MODE_PRIMARY | DS_MODE_READONLY; else mode = DS_MODE_PRIMARY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); if (error == EROFS) { mode = DS_MODE_PRIMARY | DS_MODE_READONLY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); } if (error) goto out; if (error = zfs_init_fs(zfsvfs, &zp, cr)) goto out; if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t xattr; ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL)) goto out; xattr_changed_cb(zfsvfs, xattr); zfsvfs->z_issnap = B_TRUE; } else { error = zfs_register_callbacks(vfsp); if (error) goto out; zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. */ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector); if (!zil_disable) zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { if (zfsvfs->z_os) dmu_objset_close(zfsvfs->z_os); rw_destroy(&zfsvfs->z_um_lock); mutex_destroy(&zfsvfs->z_znodes_lock); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } else { atomic_add_32(&zfs_active_fs_count, 1); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; struct dsl_dataset *ds; /* * Unregister properties. */ if (!dmu_objset_is_snapshot(os)) { ds = dmu_objset_ds(os); VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); } } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp, kthread_t *td) { char *from; int error; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) return (zfs_refresh_properties(vfsp)); if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); DROP_GIANT(); error = zfs_domount(vfsp, from, td); PICKUP_GIANT(); return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = zfsvfs->z_vfs->vfs_bsize; statp->f_iosize = zfsvfs->z_vfs->vfs_bsize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = ZFS_MAXNAMELEN; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) { *vpp = ZTOV(rootzp); - error = vn_lock(*vpp, flags, td); + error = vn_lock(*vpp, flags); (*vpp)->v_vflag |= VV_ROOT; } ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; cred_t *cr = td->td_ucred; int ret; if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) return (ret); (void) dnlc_purge_vfsp(vfsp, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) return (EBUSY); ASSERT(zfsvfs->z_ctldir->v_count == 1); } zfsctl_destroy(zfsvfs); ASSERT(zfsvfs->z_ctldir == NULL); } /* * Flush all the files. */ ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); ASSERT(zfsvfs->z_ctldir != NULL); } return (ret); } if (fflag & MS_FORCE) { MNT_ILOCK(vfsp); vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(vfsp); zfsvfs->z_unmounted1 = B_TRUE; /* * Wait for all zfs threads to leave zfs. * Grabbing a rwlock as reader in all vops and * as writer here doesn't work because it too easy to get * multiple reader enters as zfs can re-enter itself. * This can lead to deadlock if there is an intervening * rw_enter as writer. * So a file system threads ref count (z_op_cnt) is used. * A polling loop on z_op_cnt may seem inefficient, but * - this saves all threads on exit from having to grab a * mutex in order to cv_signal * - only occurs on forced unmount in the rare case when * there are outstanding threads within the file system. */ while (zfsvfs->z_op_cnt) { delay(1); } } zfs_objset_close(zfsvfs); VFS_RELE(vfsp); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { VN_RELE(ZTOV(zp)); err = EINVAL; } if (err != 0) *vpp = NULL; else { *vpp = ZTOV(zp); - vn_lock(*vpp, flags, curthread); + vn_lock(*vpp, flags); } ZFS_EXIT(zfsvfs); return (err); } static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (EINVAL); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (EINVAL); } /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *vpp = zfsvfs->z_ctldir; ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 0, NULL, NULL) == 0); } else { VN_HOLD(*vpp); } ZFS_EXIT(zfsvfs); /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } zp_gen = zp->z_phys->zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); VN_RELE(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (EINVAL); } *vpp = ZTOV(zp); /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); vnode_create_vobject(*vpp, zp->z_phys->zp_size, td); ZFS_EXIT(zfsvfs); return (0); } static void zfs_objset_close(zfsvfs_t *zfsvfs) { znode_t *zp, *nextzp; objset_t *os = zfsvfs->z_os; /* * For forced unmount, at this point all vops except zfs_inactive * are erroring EIO. We need to now suspend zfs_inactive threads * while we are freeing dbufs before switching zfs_inactive * to use behaviour without a objset. */ rw_enter(&zfsvfs->z_um_lock, RW_WRITER); /* * Release all holds on dbufs * Note, although we have stopped all other vop threads and * zfs_inactive(), the dmu can callback via znode_pageout_func() * which can zfs_znode_free() the znode. * So we lock z_all_znodes; search the list for a held * dbuf; drop the lock (we know zp can't disappear if we hold * a dbuf lock; then regrab the lock and restart. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { nextzp = list_next(&zfsvfs->z_all_znodes, zp); if (zp->z_dbuf_held) { /* dbufs should only be held when force unmounting */ zp->z_dbuf_held = 0; mutex_exit(&zfsvfs->z_znodes_lock); dmu_buf_rele(zp->z_dbuf, NULL); /* Start again */ mutex_enter(&zfsvfs->z_znodes_lock); nextzp = list_head(&zfsvfs->z_all_znodes); } } mutex_exit(&zfsvfs->z_znodes_lock); /* * Unregister properties. */ if (!dmu_objset_is_snapshot(os)) zfs_unregister_callbacks(zfsvfs); /* * Switch zfs_inactive to behaviour without an objset. * It just tosses cached pages and frees the znode & vnode. * Then re-enable zfs_inactive threads in that new behaviour. */ zfsvfs->z_unmounted2 = B_TRUE; rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ /* * Close the zil. Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } /* * Evict all dbufs so that cached znodes will be freed */ if (dmu_objset_evict_dbufs(os, 1)) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); (void) dmu_objset_evict_dbufs(os, 0); } /* * Finally close the objset */ dmu_objset_close(os); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; int i; for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); rw_destroy(&zfsvfs->z_um_lock); mutex_destroy(&zfsvfs->z_znodes_lock); kmem_free(zfsvfs, sizeof (zfsvfs_t)); atomic_add_32(&zfs_active_fs_count, -1); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int val; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes == val) desiredvnodes = (3 * desiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version " ZFS_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); } void zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 175201) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 175202) @@ -1,3599 +1,3599 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait the the intent log to commit if it's is a synchronous operation. * Morover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). * In normal operation, this will be TXG_NOWAIT. During ZIL replay, * it will be a specific txg. Either way, dmu_tx_assign() never blocks. * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to * use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, seq, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr) { znode_t *zp = VTOZ(*vpp); /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { znode_t *zp = VTOZ(vp); /* Decrement the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_dec_32(&zp->z_sync_cnt); /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_phys->zp_size; if (noff >= file_sz) { return (ENXIO); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); /* end of file? */ if ((error == ESRCH) || (noff > file_sz)) { /* * Handle the virtual hole at the end of file. */ if (hole) { *off = file_sz; return (0); } return (ENXIO); } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp) { offset_t off; int error; zfsvfs_t *zfsvfs; switch (com) { case _FIOFFS: return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ case _FIOGDIO: case _FIOSDIO: return (0); case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (EFAULT); zfsvfs = VTOZ(vp)->z_zfsvfs; ZFS_ENTER(zfsvfs); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (EFAULT); return (0); } return (ENOTTY); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; vm_page_t m; struct sf_buf *sf; int64_t start, off; int len = nbytes; int error = 0; uint64_t dirbytes; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; dirbytes = 0; VM_OBJECT_LOCK(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { uint64_t bytes = MIN(PAGESIZE - off, len); uint64_t fsize; again: if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && vm_page_is_valid(m, (vm_offset_t)off, bytes)) { uint64_t woff; caddr_t va; if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) goto again; fsize = obj->un_pager.vnp.vnp_size; vm_page_busy(m); vm_page_lock_queues(); vm_page_undirty(m); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (dirbytes > 0) { error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); dirbytes = 0; } if (error == 0) { sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); woff = uio->uio_loffset - off; error = uiomove(va + off, bytes, UIO_WRITE, uio); /* * The uiomove() above could have been partially * successful, that's why we call dmu_write() * below unconditionally. The page was marked * non-dirty above and we would lose the changes * without doing so. If the uiomove() failed * entirely, well, we just write what we got * before one more time. */ dmu_write(os, zp->z_id, woff, MIN(PAGESIZE, fsize - woff), va, tx); sf_buf_free(sf); sched_unpin(); } VM_OBJECT_LOCK(obj); vm_page_wakeup(m); } else { dirbytes += bytes; } len -= bytes; off = 0; if (error) break; } VM_OBJECT_UNLOCK(obj); if (error == 0 && dirbytes > 0) error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; vm_page_t m; struct sf_buf *sf; int64_t start, off; caddr_t va; int len = nbytes; int error = 0; uint64_t dirbytes; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; dirbytes = 0; VM_OBJECT_LOCK(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { uint64_t bytes = MIN(PAGESIZE - off, len); again: if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && vm_page_is_valid(m, (vm_offset_t)off, bytes)) { if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) goto again; vm_page_busy(m); VM_OBJECT_UNLOCK(obj); if (dirbytes > 0) { error = dmu_read_uio(os, zp->z_id, uio, dirbytes); dirbytes = 0; } if (error == 0) { sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = uiomove(va + off, bytes, UIO_READ, uio); sf_buf_free(sf); sched_unpin(); } VM_OBJECT_LOCK(obj); vm_page_wakeup(m); } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { /* * The code below is here to make sendfile(2) work * correctly with ZFS. As pointed out by ups@ * sendfile(2) should be changed to use VOP_GETPAGES(), * but it pessimize performance of sendfile/UFS, that's * why I handle this special case in ZFS code. */ if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) goto again; vm_page_busy(m); VM_OBJECT_UNLOCK(obj); if (dirbytes > 0) { error = dmu_read_uio(os, zp->z_id, uio, dirbytes); dirbytes = 0; } if (error == 0) { sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = dmu_read(os, zp->z_id, start + off, bytes, (void *)(va + off)); sf_buf_free(sf); sched_unpin(); } VM_OBJECT_LOCK(obj); vm_page_wakeup(m); if (error == 0) uio->uio_resid -= bytes; } else { dirbytes += bytes; } len -= bytes; off = 0; if (error) break; } VM_OBJECT_UNLOCK(obj); if (error == 0 && dirbytes > 0) error = dmu_read_uio(os, zp->z_id, uio, dirbytes); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 if success * error code if failure * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; ssize_t n, nbytes; int error; rl_t *rl; ZFS_ENTER(zfsvfs); /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (ioflag & FRSYNC) zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_phys->zp_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_phys->zp_size); n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); if (vn_has_cached_data(vp)) error = mappedread(vp, nbytes, uio); else error = dmu_read_uio(os, zp->z_id, uio, nbytes); if (error) break; n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Fault in the pages of the first n bytes specified by the uio structure. * 1 byte in each page is touched and the uio struct is unmodified. * Any error will exit this routine as this is only a best * attempt to get the pages resident. This is a copy of ufs_trans_touch(). */ static void zfs_prefault_write(ssize_t n, struct uio *uio) { struct iovec *iov; ulong_t cnt, incr; caddr_t p; if (uio->uio_segflg != UIO_USERSPACE) return; iov = uio->uio_iov; while (n) { cnt = MIN(iov->iov_len, n); if (cnt == 0) { /* empty iov entry */ iov++; continue; } n -= cnt; /* * touch each page in this segment. */ p = iov->iov_base; while (cnt) { if (fubyte(p) == -1) return; incr = MIN(cnt, PAGESIZE); p += incr; cnt -= incr; } /* * touch the last byte in case it straddles a page. */ p--; if (fubyte(p) == -1) return; iov++; } } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - IO_APPEND flag set if in append mode. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. */ zfs_prefault_write(n, uio); /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & IO_APPEND) { /* * Range lock for a file append: * The value for the start of range will be determined by * zfs_range_lock() (to guarantee append semantics). * If this write will cause the block size to increase, * zfs_range_lock() will lock the entire file, so we must * later reduce the range after we grow the block size. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); if (rl->r_len == UINT64_MAX) { /* overlocked, zp_size can't change */ woff = uio->uio_loffset = zp->z_phys->zp_size; } else { woff = uio->uio_loffset = rl->r_off; } } else { woff = uio->uio_loffset; /* * Validate file offset */ if (woff < 0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * If we need to grow the block size then zfs_range_lock() * will lock a wider range than we request here. * Later after growing the block size we reduce the range. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* * Check for mandatory locks */ if (MANDMODE((mode_t)zp->z_phys->zp_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (error); } end_size = MAX(zp->z_phys->zp_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { /* * Start a transaction. */ woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); continue; } dmu_tx_abort(tx); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_phys->zp_size) vnode_pager_setsize(vp, woff + nbytes); rw_enter(&zp->z_map_lock, RW_READER); tx_bytes = uio->uio_resid; if (vn_has_cached_data(vp)) { rw_exit(&zp->z_map_lock); error = mappedwrite(vp, nbytes, uio, tx); } else { error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx); rw_exit(&zp->z_map_lock); } tx_bytes -= uio->uio_resid; /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) { zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); } mutex_exit(&zp->z_acl_lock); /* * Update time stamp. NOTE: This marks the bonus buffer as * dirty, so we don't have to do it again for zp_size. */ zfs_time_stamper(zp, CONTENT_MODIFIED, tx); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, uio->uio_loffset); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (FSYNC | FDSYNC)) zil_commit(zilog, zp->z_last_itx, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(dmu_buf_t *db, void *vzgd) { zgd_t *zgd = (zgd_t *)vzgd; rl_t *rl = zgd->zgd_rl; vnode_t *vp = ZTOV(rl->r_zp); int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); dmu_buf_rele(db, vzgd); zfs_range_unlock(rl); VN_RELE(vp); zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); kmem_free(zgd, sizeof (zgd_t)); VFS_UNLOCK_GIANT(vfslocked); } /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t off = lr->lr_offset; dmu_buf_t *db; rl_t *rl; zgd_t *zgd; int dlen = lr->lr_length; /* length of user data */ int error = 0; ASSERT(zio); ASSERT(dlen != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { VN_RELE(ZTOV(zp)); return (ENOENT); } /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ rl = zfs_range_lock(zp, off, dlen, RL_READER); /* test for truncation needs to be done while range locked */ if (off >= zp->z_phys->zp_size) { error = ENOENT; goto out; } VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); } else { /* indirect write */ uint64_t boff; /* block starting offset */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { if (ISP2(zp->z_blksz)) { boff = P2ALIGN_TYPED(off, zp->z_blksz, uint64_t); } else { boff = 0; } dlen = zp->z_blksz; rl = zfs_range_lock(zp, boff, dlen, RL_READER); if (zp->z_blksz == dlen) break; zfs_range_unlock(rl); } /* test for truncation needs to be done while range locked */ if (off >= zp->z_phys->zp_size) { error = ENOENT; goto out; } zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_rl = rl; zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_bp = &lr->lr_blkptr; VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); ASSERT(boff == db->db_offset); lr->lr_blkoff = off - boff; error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz); if (error == 0) { zil_add_vdev(zfsvfs->z_log, DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); } /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before * we can release this dbuf. We will finish everything * up in the zfs_get_done() callback. */ if (error == EINPROGRESS) return (0); dmu_buf_rele(db, zgd); kmem_free(zgd, sizeof (zgd_t)); } out: zfs_range_unlock(rl); VN_RELE(ZTOV(zp)); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); error = zfs_zaccess_rwx(zp, mode, cr); ZFS_EXIT(zfsvfs); return (error); } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 if success * error code if failure * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); *vpp = NULL; #ifdef TODO if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_phys->zp_flags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { VN_RELE(*vpp); } ZFS_EXIT(zfsvfs); return (error); } #endif /* TODO */ if (dvp->v_type != VDIR) { ZFS_EXIT(zfsvfs); return (ENOTDIR); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { /* * Convert device special files */ if (IS_DEVVP(*vpp)) { vnode_t *svp; svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (svp == NULL) error = ENOSYS; else *vpp = svp; } } ZFS_EXIT(zfsvfs); /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { int ltype = 0; if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp, td); VOP_UNLOCK(dvp, 0, td); } - error = vn_lock(*vpp, cnp->cn_lkflags, td); + error = vn_lock(*vpp, cnp->cn_lkflags); if (cnp->cn_flags & ISDOTDOT) - vn_lock(dvp, ltype | LK_RETRY, td); + vn_lock(dvp, ltype | LK_RETRY); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; return (error); } } #ifdef FREEBSD_NAMECACHE /* * Insert name into cache (as non-existent) if appropriate. */ if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, *vpp, cnp); /* * Insert name into cache if appropriate. */ if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } #endif return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; objset_t *os = zfsvfs->z_os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uint64_t zoid; ZFS_ENTER(zfsvfs); top: *vpp = NULL; if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~VSVTX; if (*name == '\0') { /* * Null component name refers to the directory itself. */ VN_HOLD(dvp); zp = dzp; dl = NULL; error = 0; } else { /* possible VN_HOLD(zp) */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); return (error); } } zoid = zp ? zp->z_id : -1ULL; if (zp == NULL) { /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_phys->zp_flags & ZFS_XATTR) && (vap->va_type != VREG)) { error = EINVAL; goto out; } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); ASSERT(zp->z_id == zoid); (void) zfs_link_create(dl, zp, tx, ZNEW); zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); dmu_tx_commit(tx); } else { /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl == EXCL) { error = EEXIST; goto out; } /* * Can't open a directory for writing. */ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { error = EISDIR; goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { error = zfs_freesp(zp, 0, 0, mode, TRUE); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { /* NB: we already did dmu_tx_wait() */ zfs_dirent_unlock(dl); VN_RELE(ZTOV(zp)); goto top; } } } out: if (error == 0) { *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); } if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); /* * If vnode is for a device return a specfs vnode instead. */ if (IS_DEVVP(*vpp)) { struct vnode *svp; svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (svp == NULL) { error = ENOSYS; } *vpp = svp; } } ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ static int zfs_remove(vnode_t *dvp, char *name, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); znode_t *xzp = NULL; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t acl_obj, xattr_obj; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked; int error; ZFS_ENTER(zfsvfs); top: /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = EPERM; goto out; } vnevent_remove(vp); dnlc_remove(dvp, name); may_delete_now = FALSE; /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); if (may_delete_now) dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); /* are there any extended attributes? */ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { /* XXX - do we need this if we are deleting? */ dmu_tx_hold_bonus(tx, xattr_obj); } /* are there any additional acls */ if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (0 && unlinked) { VI_LOCK(vp); delete_now = may_delete_now && vp->v_count == 1 && !vn_has_cached_data(vp) && zp->z_phys->zp_xattr == xattr_obj && zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; VI_UNLOCK(vp); } if (delete_now) { if (zp->z_phys->zp_xattr) { error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); ASSERT3U(error, ==, 0); ASSERT3U(xzp->z_phys->zp_links, ==, 2); dmu_buf_will_dirty(xzp->z_dbuf, tx); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; xzp->z_phys->zp_links = 0; mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); zp->z_phys->zp_xattr = 0; /* probably unnecessary */ } mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count--; ASSERT3U(vp->v_count, ==, 0); VI_UNLOCK(vp); mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); VFS_RELE(zfsvfs->z_vfs); } else if (unlinked) { zfs_unlinked_add(zp, tx); } zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); dmu_tx_commit(tx); out: zfs_dirent_unlock(dl); if (!delete_now) { VN_RELE(vp); } else if (xzp) { /* this rele delayed to prevent nesting transactions */ VN_RELE(ZTOV(xzp)); } ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * * OUT: vpp - vnode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; zfs_dirlock_t *dl; uint64_t zoid = 0; dmu_tx_t *tx; int error; ASSERT(vap->va_type == VDIR); ZFS_ENTER(zfsvfs); if (dzp->z_phys->zp_flags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } top: *vpp = NULL; /* * First make sure the new directory doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); /* * Now put new name in parent dir. */ (void) zfs_link_create(dl, zp, tx, ZNEW); *vpp = ZTOV(zp); zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); dmu_tx_commit(tx); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated */ static int zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } if (vp == cwd) { error = EINVAL; goto out; } vnevent_rmdir(vp); /* * Grab a lock on the directory to make sure that noone is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } #ifdef FREEBSD_NAMECACHE cache_purge(dvp); #endif error = zfs_link_destroy(dl, zp, tx, 0, NULL); if (error == 0) zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); #ifdef FREEBSD_NAMECACHE cache_purge(vp); #endif out: zfs_dirent_unlock(dl); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure. * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ int local_eof; int outcount; int error; uint8_t prefetch; uint8_t type; int ncooks; u_long *cooks = NULL; ZFS_ENTER(zfsvfs); /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; odp = (struct dirent64 *)iovp->iov_base; } if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); objnum = zp->z_phys->zp_parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = ENXIO; goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); } reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = EINVAL; goto update; } break; } /* * Add this entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; outcount += reclen; odp = (dirent64_t *)((intptr_t)odp + reclen); ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0); /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * flags - [UNUSED] * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_phys_t *pzp = zp->z_phys; uint32_t blksize; u_longlong_t nblocks; int error; ZFS_ENTER(zfsvfs); /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(pzp->zp_mode); vap->va_mode = pzp->zp_mode & ~S_IFMT; vap->va_uid = zp->z_phys->zp_uid; vap->va_gid = zp->z_phys->zp_gid; vap->va_nodeid = zp->z_id; vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ vap->va_size = pzp->zp_size; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && (zp->z_phys->zp_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zp->z_lock); dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * flags - ATTR_UTIME set if non-default time values provided. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { struct znode *zp = VTOZ(vp); znode_phys_t *pzp = zp->z_phys; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; vattr_t oldva; uint_t mask = vap->va_mask; uint_t saved_mask; int trim_mask = 0; uint64_t new_mode; znode_t *attrzp; int need_policy = FALSE; int err; if (mask == 0) return (0); if (mask & AT_NOSET) return (EINVAL); if (mask & AT_SIZE && vp->v_type == VDIR) return (EISDIR); if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) return (EINVAL); ZFS_ENTER(zfsvfs); top: attrzp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); } /* * First validate permissions */ if (mask & AT_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ do { err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); /* NB: we already did dmu_tx_wait() if necessary */ } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME)) need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = pzp->zp_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = pzp->zp_mode; oldva.va_uid = zp->z_phys->zp_uid; oldva.va_gid = zp->z_phys->zp_gid; mutex_exit(&zp->z_lock); if (mask & AT_MODE) { if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_phys->zp_acl.z_acl_extern_obj) dmu_tx_hold_write(tx, pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); else dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); } if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); if (err) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (err); } dmu_tx_hold_bonus(tx, attrzp->z_id); } err = dmu_tx_assign(tx, zfsvfs->z_assign); if (err) { if (attrzp) VN_RELE(ZTOV(attrzp)); if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (err); } dmu_buf_will_dirty(zp->z_dbuf, tx); /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ mutex_enter(&zp->z_lock); if (mask & AT_MODE) { err = zfs_acl_chmod_setattr(zp, new_mode, tx); ASSERT3U(err, ==, 0); } if (attrzp) mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { zp->z_phys->zp_uid = (uint64_t)vap->va_uid; if (attrzp) { attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; } } if (mask & AT_GID) { zp->z_phys->zp_gid = (uint64_t)vap->va_gid; if (attrzp) attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; } if (attrzp) mutex_exit(&attrzp->z_lock); if (mask & AT_ATIME) ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); if (mask & AT_MTIME) ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); if (mask & AT_SIZE) zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); else if (mask != 0) zfs_time_stamper_locked(zp, STATE_CHANGED, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); mutex_exit(&zp->z_lock); if (attrzp) VN_RELE(ZTOV(attrzp)); dmu_tx_commit(tx); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) VN_RELE(ZTOV(zl->zl_znode)); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; uint64_t *oidp = &zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = &zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (*oidp == szp->z_id) /* We're a descendant of szp */ return (EINVAL); if (*oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); if (error) return (error); zl->zl_znode = zp; } oidp = &zp->z_phys->zp_parent; rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ static int zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = VTOZ(sdvp); zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; vnode_t *realvp; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr, error; ZFS_ENTER(zfsvfs); /* * Make sure we have the real vp for the target directory. */ if (VOP_REALVP(tdvp, &realvp) == 0) tdvp = realvp; if (tdvp->v_vfsp != sdvp->v_vfsp) { ZFS_EXIT(zfsvfs); return (EXDEV); } tdzp = VTOZ(tdvp); top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != (sdzp->z_phys->zp_flags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { cmp = strcmp(snm, tnm); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) VN_RELE(ZTOV(tzp)); } if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); VN_RELE(ZTOV(szp)); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto out; if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (ZTOV(szp)->v_type == VDIR) { if (ZTOV(tzp)->v_type != VDIR) { error = ENOTDIR; goto out; } } else { if (ZTOV(tzp)->v_type == VDIR) { error = EISDIR; goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } vnevent_rename_src(ZTOV(szp)); if (tzp) vnevent_rename_dest(ZTOV(tzp)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ if (tzp) dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); ASSERT(error == 0); zfs_log_rename(zilog, tx, TX_RENAME, sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); } #ifdef FREEBSD_NAMECACHE if (error == 0) { cache_purge(sdvp); cache_purge(tdvp); } #endif } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * target - Target path of new symlink. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated */ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t zoid; int len = strlen(link); int error; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); top: if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (ENAMETOOLONG); } /* * Attempt to lock directory; fail if entry already exists. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } dmu_buf_will_dirty(dzp->z_dbuf, tx); /* * Create a new object for the symlink. * Put the link content into bonus buffer if it will fit; * otherwise, store it just like any other file data. */ zoid = 0; if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. */ zfs_grow_blocksize(zp, len, tx); VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); bcopy(link, dbp->db_data, len); dmu_buf_rele(dbp, FTAG); } zp->z_phys->zp_size = len; /* * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); out: if (error == 0) { zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); } dmu_tx_commit(tx); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uoip - structure to contain the link path. * cr - credentials of caller. * * OUT: uio - structure to contain the link path. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; size_t bufsz; int error; ZFS_ENTER(zfsvfs); bufsz = (size_t)zp->z_phys->zp_size; if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { error = uiomove(zp->z_phys + 1, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); } else { dmu_buf_t *dbp; error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); if (error) { ZFS_EXIT(zfsvfs); return (error); } error = uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); dmu_buf_rele(dbp, FTAG); } ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; zfs_dirlock_t *dl; dmu_tx_t *tx; vnode_t *realvp; int error; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); if (VOP_REALVP(svp, &realvp) == 0) svp = realvp; if (svp->v_vfsp != tdvp->v_vfsp) { ZFS_EXIT(zfsvfs); return (EXDEV); } szp = VTOZ(svp); top: /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_phys->zp_flags & ZFS_XATTR) != (dzp->z_phys->zp_flags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (EPERM); } if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dl, szp, tx, 0); if (error == 0) zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); dmu_tx_commit(tx); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } void zfs_inactive(vnode_t *vp, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_um_lock, RW_READER); if (zfsvfs->z_unmounted2) { ASSERT(zp->z_dbuf_held == 0); mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count = 0; /* count arrives as 1 */ VI_UNLOCK(vp); if (zp->z_dbuf == NULL) { mutex_exit(&zp->z_lock); zfs_znode_free(zp); } else { mutex_exit(&zp->z_lock); } rw_exit(&zfsvfs->z_um_lock); VFS_RELE(zfsvfs->z_vfs); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_buf_will_dirty(zp->z_dbuf, tx); mutex_enter(&zp->z_lock); zp->z_atime_dirty = 0; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); rw_exit(&zfsvfs->z_um_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); static int zfs_fid(vnode_t *vp, fid_t *fidp) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen = (uint32_t)zp->z_phys->zp_gen; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i; ZFS_ENTER(zfsvfs); size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; zfs_dirlock_t *dl; int error; switch (cmd) { case _PC_LINK_MAX: *valp = INT_MAX; return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #if 0 case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); *valp = 0; error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { zfs_dirent_unlock(dl); if (!zfs_dirempty(xzp)) *valp = 1; VN_RELE(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. */ error = 0; } ZFS_EXIT(zfsvfs); return (error); #endif case _PC_ACL_EXTENDED: *valp = 0; /* TODO */ return (0); case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); default: return (EOPNOTSUPP); } } #ifdef TODO /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); error = zfs_getacl(zp, vsecp, cr); ZFS_EXIT(zfsvfs); return (error); } #endif /* TODO */ #ifdef TODO /*ARGSUSED*/ static int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); error = zfs_setacl(zp, vsecp, cr); ZFS_EXIT(zfsvfs); return (error); } #endif /* TODO */ static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred); if (error == 0) vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL)); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred)); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred)); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vattr_t *vap = ap->a_vap; /* No support for FreeBSD's chflags(2). */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & SAVENAME); ASSERT(ap->a_tcnp->cn_flags & SAVENAME); error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred); if (tdvp == tvp) VN_RELE(tdvp); else VN_URELE(tdvp); if (tvp) VN_URELE(tvp); VN_RELE(fdvp); VN_RELE(fvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs; int rele = 1; ASSERT(zp != NULL); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); mutex_enter(&zp->z_lock); ASSERT(zp->z_phys); ASSERT(zp->z_dbuf_held); zfsvfs = zp->z_zfsvfs; if (!zp->z_unlinked) { zp->z_dbuf_held = 0; ZTOV(zp) = NULL; mutex_exit(&zp->z_lock); dmu_buf_rele(zp->z_dbuf, NULL); } else { mutex_exit(&zp->z_lock); } VI_LOCK(vp); if (vp->v_count > 0) rele = 0; vp->v_data = NULL; ASSERT(vp->v_holdcnt >= 1); VI_UNLOCK(vp); if (!zp->z_unlinked && rele) VFS_RELE(zfsvfs->z_vfs); return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP) error = vop_stdpathconf(ap); return (error); } /* * Advisory record locking support */ static int zfs_freebsd_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { znode_t *zp = VTOZ(ap->a_vp); return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); } struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, #ifdef FREEBSD_NAMECACHE .vop_lookup = vfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, #else .vop_lookup = zfs_freebsd_lookup, #endif .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_advlock = zfs_freebsd_advlock, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = VOP_EOPNOTSUPP, .vop_fid = zfs_freebsd_fid, }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_fid = zfs_freebsd_fid, }; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 175201) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 175202) @@ -1,1072 +1,1072 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ #pragma ident "%Z%%M% %I% %E% SMI" #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), "sizeof(znode_t)"); /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL struct kmem_cache *znode_cache = NULL; /*ARGSUSED*/ static void znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) { znode_t *zp = user_ptr; vnode_t *vp; mutex_enter(&zp->z_lock); vp = ZTOV(zp); if (vp == NULL) { mutex_exit(&zp->z_lock); zfs_znode_free(zp); } else if (vp->v_count == 0) { ZTOV(zp) = NULL; vhold(vp); mutex_exit(&zp->z_lock); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp, curthread); VOP_UNLOCK(vp, 0, curthread); vdrop(vp); zfs_znode_free(zp); } else { /* signal force unmount that this znode can be freed */ zp->z_dbuf = NULL; mutex_exit(&zp->z_lock); } } extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; /* * XXX: We cannot use this function as a cache constructor, because * there is one global cache for all file systems and we need * to pass vfsp here, which is not possible, because argument * 'cdrarg' is defined at kmem_cache_create() time. */ static int zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) { znode_t *zp = buf; vnode_t *vp; vfs_t *vfsp = cdrarg; int error; if (cdrarg != NULL) { error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); ASSERT(error == 0); zp->z_vnode = vp; vp->v_data = (caddr_t)zp; vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; } else { zp->z_vnode = NULL; } mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); zp->z_dbuf_held = 0; zp->z_dirlocks = 0; zp->z_lockf = NULL; return (0); } /*ARGSUSED*/ static void zfs_znode_cache_destructor(void *buf, void *cdarg) { znode_t *zp = buf; ASSERT(zp->z_dirlocks == 0); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); mutex_destroy(&zp->z_range_lock); avl_destroy(&zp->z_range_avl); ASSERT(zp->z_dbuf_held == 0); } void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; } /* * zfs_init_fs - Initialize the zfsvfs struct and the file system * incore "master" object. Verify version compatibility. */ int zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) { objset_t *os = zfsvfs->z_os; uint64_t version = ZPL_VERSION; int i, error; dmu_object_info_t doi; uint64_t fsid_guid; *zpp = NULL; /* * XXX - hack to auto-create the pool root filesystem at * the first attempted mount. */ if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ error = dmu_tx_assign(tx, TXG_WAIT); ASSERT3U(error, ==, 0); zfs_create_fs(os, cr, tx); dmu_tx_commit(tx); } error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1, &version); if (error) { return (error); } else if (version != ZPL_VERSION) { (void) printf("Mismatched versions: File system " "is version %lld on-disk format, which is " "incompatible with this software version %lld!", (u_longlong_t)version, ZPL_VERSION); return (ENOTSUP); } /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error) return (error); ASSERT(zfsvfs->z_root != 0); /* * Create the per mount vop tables. */ /* * Initialize zget mutex's */ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); if (error) return (error); ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error) return (error); return (0); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif #ifndef major #define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number */ #endif #ifndef minor #define minor(x) ((int)((x)&0xffff00ff)) /* minor number */ #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } /* * Construct a new znode/vnode and intialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) { znode_t *zp; vnode_t *vp; int error; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0); ASSERT(zp->z_dirlocks == NULL); zp->z_phys = db->db_data; zp->z_zfsvfs = zfsvfs; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_dbuf_held = 0; zp->z_mapcnt = 0; zp->z_last_itx = 0; zp->z_dbuf = db; zp->z_id = obj_num; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); vp = ZTOV(zp); if (vp == NULL) return (zp); error = insmntque(vp, zfsvfs->z_vfs); KASSERT(error == 0, ("insmntque() failed: error %d", error)); vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; case VFIFO: vp->v_op = &zfs_fifoops; break; } return (zp); } static void zfs_znode_dmu_init(znode_t *zp) { znode_t *nzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_buf_t *db = zp->z_dbuf; mutex_enter(&zp->z_lock); nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func); /* * there should be no * concurrent zgets on this object. */ ASSERT3P(nzp, ==, NULL); /* * Slap on VROOT if we are the root znode */ if (zp->z_id == zfsvfs->z_root) { ZTOV(zp)->v_flag |= VROOT; } ASSERT(zp->z_dbuf_held == 0); zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); mutex_exit(&zp->z_lock); } /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * IS_REPLAY - intent log replay * * OUT: oid - ID of created object * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, int bonuslen) { dmu_buf_t *dbp; znode_phys_t *pzp; znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; timestruc_t now; uint64_t gen; int err; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ *oid = vap->va_nodeid; flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { *oid = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); } /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be to needed allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (flag & IS_REPLAY) { err = zap_create_claim(zfsvfs->z_os, *oid, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { *oid = zap_create(zfsvfs->z_os, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { if (flag & IS_REPLAY) { err = dmu_object_claim(zfsvfs->z_os, *oid, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { *oid = dmu_object_alloc(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); dmu_buf_will_dirty(dbp, tx); /* * Initialize the znode physical data to zero. */ ASSERT(dbp->db_size >= sizeof (znode_phys_t)); bzero(dbp->db_data, dbp->db_size); pzp = dbp->db_data; /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_phys = pzp; dzp->z_id = *oid; } /* * If parent is an xattr, so am I. */ if (dzp->z_phys->zp_flags & ZFS_XATTR) flag |= IS_XATTR; if (vap->va_type == VBLK || vap->va_type == VCHR) { pzp->zp_rdev = zfs_expldev(vap->va_rdev); } if (vap->va_type == VDIR) { pzp->zp_size = 2; /* contents ("." and "..") */ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } pzp->zp_parent = dzp->z_id; if (flag & IS_XATTR) pzp->zp_flags |= ZFS_XATTR; pzp->zp_gen = gen; ZFS_TIME_ENCODE(&now, pzp->zp_crtime); ZFS_TIME_ENCODE(&now, pzp->zp_ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); } else { ZFS_TIME_ENCODE(&now, pzp->zp_atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); } else { ZFS_TIME_ENCODE(&now, pzp->zp_mtime); } pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); zfs_perm_init(zp, dzp, flag, vap, tx, cr); if (zpp) { kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); mutex_enter(hash_mtx); zfs_znode_dmu_init(zp); mutex_exit(hash_mtx); *zpp = zp; } else { if (ZTOV(zp) != NULL) ZTOV(zp)->v_count = 0; dmu_buf_rele(dbp, NULL); zfs_znode_free(zp); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; int err; *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_ZNODE || doi.doi_bonus_size < sizeof (znode_phys_t)) { dmu_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EINVAL); } ASSERT(db->db_object == obj_num); ASSERT(db->db_offset == -1); ASSERT(db->db_data != NULL); zp = dmu_buf_get_user(db); if (zp != NULL) { mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { dmu_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (ENOENT); } else if (zp->z_dbuf_held) { dmu_buf_rele(db, NULL); } else { zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); } if (ZTOV(zp) != NULL) VN_HOLD(ZTOV(zp)); else { err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops, &zp->z_vnode); ASSERT(err == 0); vp = ZTOV(zp); vp->v_data = (caddr_t)zp; vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); if (vp->v_type == VDIR) zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ err = insmntque(vp, zfsvfs->z_vfs); KASSERT(err == 0, ("insmntque() failed: error %d", err)); } mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } /* * Not found create new znode/vnode */ zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); ASSERT3U(zp->z_id, ==, obj_num); zfs_znode_dmu_init(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); if (zp->z_phys->zp_acl.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, zp->z_phys->zp_acl.z_acl_extern_obj, tx); ASSERT3U(error, ==, 0); } error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); ASSERT3U(error, ==, 0); zp->z_dbuf_held = 0; ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); dmu_buf_rele(zp->z_dbuf, NULL); } void zfs_zinactive(znode_t *zp) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT(zp->z_dbuf_held && zp->z_phys); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); mutex_enter(&zp->z_lock); VI_LOCK(vp); if (vp->v_count > 0) { /* * If the hold count is greater than zero, somebody has * obtained a new reference on this znode while we were * processing it here, so we are done. */ VI_UNLOCK(vp); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); return; } VI_UNLOCK(vp); /* * If this was the last reference to a file with no links, * remove the file from the file system. */ if (zp->z_unlinked) { ZTOV(zp) = NULL; mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); ASSERT(vp->v_count == 0); vrecycle(vp, curthread); zfs_rmnode(zp); VFS_RELE(zfsvfs->z_vfs); return; } ASSERT(zp->z_phys); ASSERT(zp->z_dbuf_held); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; mutex_enter(&zfsvfs->z_znodes_lock); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); kmem_cache_free(znode_cache, zp); } void zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) { timestruc_t now; ASSERT(MUTEX_HELD(&zp->z_lock)); gethrestime(&now); if (tx) { dmu_buf_will_dirty(zp->z_dbuf, tx); zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); if (flag & AT_MTIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); if (flag & AT_CTIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); } /* * Update the requested znode timestamps with the current time. * If we are in a transaction, then go ahead and mark the znode * dirty in the transaction so the timestamps will go to disk. * Otherwise, we will get pushed next time the znode is updated * in a transaction, or when this znode eventually goes inactive. * * Why is this OK? * 1 - Only the ACCESS time is ever updated outside of a transaction. * 2 - Multiple consecutive updates will be collapsed into a single * znode update by the transaction grouping semantics of the DMU. */ void zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) { mutex_enter(&zp->z_lock); zfs_time_stamper_locked(zp, flag, tx); mutex_exit(&zp->z_lock); } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT3U(error, ==, 0); /* What blocksize did we actually get? */ dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free (0 => to EOF). * flag - current file open mode flags. * * RETURN: 0 if success * error code if failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; rl_t *rl; uint64_t end = off + len; uint64_t size, new_blksz; int error; if (ZTOV(zp)->v_type == VFIFO) return (0); /* * If we will change zp_size then lock the whole file, * otherwise just lock the range being freed. */ if (len == 0 || off + len > zp->z_phys->zp_size) { rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); } else { rl = zfs_range_lock(zp, off, len, RL_WRITER); /* recheck, in case zp_size changed */ if (off + len > zp->z_phys->zp_size) { /* lost race: file size changed, lock whole file */ zfs_range_unlock(rl); rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); } } /* * Nothing to do if file already at desired length. */ size = zp->z_phys->zp_size; if (len == 0 && size == off && off != 0) { zfs_range_unlock(rl); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); new_blksz = 0; if (end > size && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end, SPA_MAXBLOCKSIZE); } else { new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); } else if (off < size) { /* * If len == 0, we are truncating the file. */ dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END); } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) dmu_tx_wait(tx); dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } if (new_blksz) zfs_grow_blocksize(zp, new_blksz, tx); if (end > size || len == 0) zp->z_phys->zp_size = end; if (off < size) { objset_t *os = zfsvfs->z_os; uint64_t rlen = len; if (len == 0) rlen = -1; else if (end > size) rlen = size - off; VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); } if (log) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); } zfs_range_unlock(rl); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ rw_enter(&zp->z_map_lock, RW_WRITER); if (end > size) vnode_pager_setsize(vp, end); else if (len == 0) { #if 0 error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); #else error = vinvalbuf(vp, V_SAVE, curthread, 0, 0); vnode_pager_setsize(vp, end); #endif } rw_exit(&zp->z_map_lock); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) { zfsvfs_t zfsvfs; uint64_t moid, doid, roid = 0; uint64_t version = ZPL_VERSION; int error; znode_t *rootzp = NULL; vattr_t vattr; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx); ASSERT(error == 0); /* * Create a delete queue. */ doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = UID_ROOT; vattr.va_gid = GID_WHEEL; rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); zfs_znode_cache_constructor(rootzp, NULL, 0); rootzp->z_zfsvfs = &zfsvfs; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_dbuf_held = 0; bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); ASSERT3U(rootzp->z_id, ==, roid); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); ASSERT(error == 0); mutex_destroy(&zfsvfs.z_znodes_lock); kmem_cache_free(znode_cache, rootzp); } #endif /* _KERNEL */ /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) { dmu_buf_t *db; dmu_object_info_t doi; znode_phys_t *zp; int error; if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) return (error); dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_ZNODE || doi.doi_bonus_size < sizeof (znode_phys_t)) { dmu_buf_rele(db, FTAG); return (EINVAL); } zp = db->db_data; *pobjp = zp->zp_parent; *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && S_ISDIR(zp->zp_mode); dmu_buf_rele(db, FTAG); return (0); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { char *path = buf + len - 1; int error; *path = '\0'; for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if ((error = zfs_obj_to_pobj(osp, obj, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); bcopy(component, path, complen); obj = pobj; } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } Index: head/sys/compat/linprocfs/linprocfs.c =================================================================== --- head/sys/compat/linprocfs/linprocfs.c (revision 175201) +++ head/sys/compat/linprocfs/linprocfs.c (revision 175202) @@ -1,1275 +1,1275 @@ /*- * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav * Copyright (c) 1999 Pierre Beyssac * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif /* __i386__ || __amd64__ */ #include "opt_compat.h" #ifdef COMPAT_LINUX32 /* XXX */ #include #else #include #endif #include #include #include #include #include /* * Various conversion macros */ #define T2J(x) (((x) * 100UL) / (stathz ? stathz : hz)) /* ticks to jiffies */ #define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */ #define B2K(x) ((x) >> 10) /* bytes to kbytes */ #define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */ #define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */ #define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */ /** * @brief Mapping of ki_stat in struct kinfo_proc to the linux state * * The linux procfs state field displays one of the characters RSDZTW to * denote running, sleeping in an interruptible wait, waiting in an * uninterruptible disk sleep, a zombie process, process is being traced * or stopped, or process is paging respectively. * * Our struct kinfo_proc contains the variable ki_stat which contains a * value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK. * * This character array is used with ki_stati-1 as an index and tries to * map our states to suitable linux states. */ static char linux_state[] = "RRSTZDD"; /* * Filler function for proc/meminfo */ static int linprocfs_domeminfo(PFS_FILL_ARGS) { unsigned long memtotal; /* total memory in bytes */ unsigned long memused; /* used memory in bytes */ unsigned long memfree; /* free memory in bytes */ unsigned long memshared; /* shared memory ??? */ unsigned long buffers, cached; /* buffer / cache memory ??? */ unsigned long long swaptotal; /* total swap space in bytes */ unsigned long long swapused; /* used swap space in bytes */ unsigned long long swapfree; /* free swap space in bytes */ vm_object_t object; int i, j; memtotal = physmem * PAGE_SIZE; /* * The correct thing here would be: * memfree = cnt.v_free_count * PAGE_SIZE; memused = memtotal - memfree; * * but it might mislead linux binaries into thinking there * is very little memory left, so we cheat and tell them that * all memory that isn't wired down is free. */ memused = cnt.v_wire_count * PAGE_SIZE; memfree = memtotal - memused; swap_pager_status(&i, &j); swaptotal = (unsigned long long)i * PAGE_SIZE; swapused = (unsigned long long)j * PAGE_SIZE; swapfree = swaptotal - swapused; memshared = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) memshared += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); memshared *= PAGE_SIZE; /* * We'd love to be able to write: * buffers = bufspace; * * but bufspace is internal to vfs_bio.c and we don't feel * like unstaticizing it just for linprocfs's sake. */ buffers = 0; cached = cnt.v_cache_count * PAGE_SIZE; sbuf_printf(sb, " total: used: free: shared: buffers: cached:\n" "Mem: %lu %lu %lu %lu %lu %lu\n" "Swap: %llu %llu %llu\n" "MemTotal: %9lu kB\n" "MemFree: %9lu kB\n" "MemShared:%9lu kB\n" "Buffers: %9lu kB\n" "Cached: %9lu kB\n" "SwapTotal:%9llu kB\n" "SwapFree: %9llu kB\n", memtotal, memused, memfree, memshared, buffers, cached, swaptotal, swapused, swapfree, B2K(memtotal), B2K(memfree), B2K(memshared), B2K(buffers), B2K(cached), B2K(swaptotal), B2K(swapfree)); return (0); } #if defined(__i386__) || defined(__amd64__) /* * Filler function for proc/cpuinfo (i386 & amd64 version) */ static int linprocfs_docpuinfo(PFS_FILL_ARGS) { int hw_model[2]; char model[128]; size_t size; int class, fqmhz, fqkhz; int i; /* * We default the flags to include all non-conflicting flags, * and the Intel versions of conflicting flags. */ static char *flags[] = { "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", "sep", "sep", "mtrr", "pge", "mca", "cmov", "pat", "pse36", "pn", "b19", "b20", "b21", "mmxext", "mmx", "fxsr", "xmm", "b26", "b27", "b28", "b29", "3dnowext", "3dnow" }; switch (cpu_class) { #ifdef __i386__ case CPUCLASS_286: class = 2; break; case CPUCLASS_386: class = 3; break; case CPUCLASS_486: class = 4; break; case CPUCLASS_586: class = 5; break; case CPUCLASS_686: class = 6; break; default: class = 0; break; #else /* __amd64__ */ default: class = 15; break; #endif } hw_model[0] = CTL_HW; hw_model[1] = HW_MODEL; model[0] = '\0'; size = sizeof(model); if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0) strcpy(model, "unknown"); for (i = 0; i < mp_ncpus; ++i) { sbuf_printf(sb, "processor\t: %d\n" "vendor_id\t: %.20s\n" "cpu family\t: %d\n" "model\t\t: %d\n" "model name\t: %s\n" "stepping\t: %d\n", i, cpu_vendor, class, cpu, model, cpu_id & 0xf); /* XXX per-cpu vendor / class / model / id? */ } sbuf_cat(sb, "flags\t\t:"); if (!strcmp(cpu_vendor, "AuthenticAMD") && (class < 6)) { flags[16] = "fcmov"; } else if (!strcmp(cpu_vendor, "CyrixInstead")) { flags[24] = "cxmmx"; } for (i = 0; i < 32; i++) if (cpu_feature & (1 << i)) sbuf_printf(sb, " %s", flags[i]); sbuf_cat(sb, "\n"); if (class >= 5) { fqmhz = (tsc_freq + 4999) / 1000000; fqkhz = ((tsc_freq + 4999) / 10000) % 100; sbuf_printf(sb, "cpu MHz\t\t: %d.%02d\n" "bogomips\t: %d.%02d\n", fqmhz, fqkhz, fqmhz, fqkhz); } return (0); } #endif /* __i386__ || __amd64__ */ /* * Filler function for proc/mtab * * This file doesn't exist in Linux' procfs, but is included here so * users can symlink /compat/linux/etc/mtab to /proc/mtab */ static int linprocfs_domtab(PFS_FILL_ARGS) { struct nameidata nd; struct mount *mp; const char *lep; char *dlep, *flep, *mntto, *mntfrom, *fstype; size_t lep_len; int error; /* resolve symlinks etc. in the emulation tree prefix */ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, linux_emul_path, td); flep = NULL; error = namei(&nd); VFS_UNLOCK_GIANT(NDHASGIANT(&nd)); if (error != 0 || vn_fullpath(td, nd.ni_vp, &dlep, &flep) != 0) lep = linux_emul_path; else lep = dlep; lep_len = strlen(lep); mtx_lock(&mountlist_mtx); error = 0; TAILQ_FOREACH(mp, &mountlist, mnt_list) { /* determine device name */ mntfrom = mp->mnt_stat.f_mntfromname; /* determine mount point */ mntto = mp->mnt_stat.f_mntonname; if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/') mntto += lep_len; /* determine fs type */ fstype = mp->mnt_stat.f_fstypename; if (strcmp(fstype, pn->pn_info->pi_name) == 0) mntfrom = fstype = "proc"; else if (strcmp(fstype, "procfs") == 0) continue; if (strcmp(fstype, "linsysfs") == 0) { sbuf_printf(sb, "/sys %s sysfs %s", mntto, mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw"); } else { sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype, mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw"); } #define ADD_OPTION(opt, name) \ if (mp->mnt_stat.f_flags & (opt)) sbuf_printf(sb, "," name); ADD_OPTION(MNT_SYNCHRONOUS, "sync"); ADD_OPTION(MNT_NOEXEC, "noexec"); ADD_OPTION(MNT_NOSUID, "nosuid"); ADD_OPTION(MNT_UNION, "union"); ADD_OPTION(MNT_ASYNC, "async"); ADD_OPTION(MNT_SUIDDIR, "suiddir"); ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow"); ADD_OPTION(MNT_NOATIME, "noatime"); #undef ADD_OPTION /* a real Linux mtab will also show NFS options */ sbuf_printf(sb, " 0 0\n"); } mtx_unlock(&mountlist_mtx); if (flep != NULL) free(flep, M_TEMP); return (error); } /* * Filler function for proc/stat */ static int linprocfs_dostat(PFS_FILL_ARGS) { struct pcpu *pcpu; long cp_time[CPUSTATES]; long *cp; int i; read_cpu_time(cp_time); sbuf_printf(sb, "cpu %ld %ld %ld %ld\n", T2J(cp_time[CP_USER]), T2J(cp_time[CP_NICE]), T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/), T2J(cp_time[CP_IDLE])); for (i = 0; i <= mp_maxid; ++i) { if (CPU_ABSENT(i)) continue; pcpu = pcpu_find(i); cp = pcpu->pc_cp_time; sbuf_printf(sb, "cpu%d %ld %ld %ld %ld\n", i, T2J(cp[CP_USER]), T2J(cp[CP_NICE]), T2J(cp[CP_SYS] /*+ cp[CP_INTR]*/), T2J(cp[CP_IDLE])); } sbuf_printf(sb, "disk 0 0 0 0\n" "page %u %u\n" "swap %u %u\n" "intr %u\n" "ctxt %u\n" "btime %lld\n", cnt.v_vnodepgsin, cnt.v_vnodepgsout, cnt.v_swappgsin, cnt.v_swappgsout, cnt.v_intr, cnt.v_swtch, (long long)boottime.tv_sec); return (0); } /* * Filler function for proc/uptime */ static int linprocfs_douptime(PFS_FILL_ARGS) { long cp_time[CPUSTATES]; struct timeval tv; getmicrouptime(&tv); read_cpu_time(cp_time); sbuf_printf(sb, "%lld.%02ld %ld.%02ld\n", (long long)tv.tv_sec, tv.tv_usec / 10000, T2S(cp_time[CP_IDLE]), T2J(cp_time[CP_IDLE]) % 100); return (0); } /* * Get OS build date */ static void linprocfs_osbuild(struct thread *td, struct sbuf *sb) { #if 0 char osbuild[256]; char *cp1, *cp2; strncpy(osbuild, version, 256); osbuild[255] = '\0'; cp1 = strstr(osbuild, "\n"); cp2 = strstr(osbuild, ":"); if (cp1 && cp2) { *cp1 = *cp2 = '\0'; cp1 = strstr(osbuild, "#"); } else cp1 = NULL; if (cp1) sbuf_printf(sb, "%s%s", cp1, cp2 + 1); else #endif sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977"); } /* * Get OS builder */ static void linprocfs_osbuilder(struct thread *td, struct sbuf *sb) { #if 0 char builder[256]; char *cp; cp = strstr(version, "\n "); if (cp) { strncpy(builder, cp + 5, 256); builder[255] = '\0'; cp = strstr(builder, ":"); if (cp) *cp = '\0'; } if (cp) sbuf_cat(sb, builder); else #endif sbuf_cat(sb, "des@freebsd.org"); } /* * Filler function for proc/version */ static int linprocfs_doversion(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s version %s (", osname, osrelease); linprocfs_osbuilder(td, sb); sbuf_cat(sb, ") (gcc version " __VERSION__ ") "); linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/loadavg */ static int linprocfs_doloadavg(PFS_FILL_ARGS) { sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", (int)(averunnable.ldavg[0] / averunnable.fscale), (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[1] / averunnable.fscale), (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ ); return (0); } /* * Filler function for proc/pid/stat */ static int linprocfs_doprocstat(PFS_FILL_ARGS) { struct kinfo_proc kp; char state; static int ratelimit = 0; PROC_LOCK(p); fill_kinfo_proc(p, &kp); sbuf_printf(sb, "%d", p->p_pid); #define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg) PS_ADD("comm", "(%s)", p->p_comm); if (kp.ki_stat > sizeof(linux_state)) { state = 'R'; if (ratelimit == 0) { printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n", kp.ki_stat, sizeof(linux_state)); ++ratelimit; } } else state = linux_state[kp.ki_stat - 1]; PS_ADD("state", "%c", state); PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0); PS_ADD("pgrp", "%d", p->p_pgid); PS_ADD("session", "%d", p->p_session->s_sid); PROC_UNLOCK(p); PS_ADD("tty", "%d", 0); /* XXX */ PS_ADD("tpgid", "%d", kp.ki_tpgid); PS_ADD("flags", "%u", 0); /* XXX */ PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt); PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt); PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt); PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt); PS_ADD("utime", "%ld", T2J(tvtohz(&kp.ki_rusage.ru_utime))); PS_ADD("stime", "%ld", T2J(tvtohz(&kp.ki_rusage.ru_stime))); PS_ADD("cutime", "%ld", T2J(tvtohz(&kp.ki_rusage_ch.ru_utime))); PS_ADD("cstime", "%ld", T2J(tvtohz(&kp.ki_rusage_ch.ru_stime))); PS_ADD("priority", "%d", kp.ki_pri.pri_user); PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */ PS_ADD("0", "%d", 0); /* removed field */ PS_ADD("itrealvalue", "%d", 0); /* XXX */ /* XXX: starttime is not right, it is the _same_ for _every_ process. It should be the number of jiffies between system boot and process start. */ PS_ADD("starttime", "%lu", T2J(tvtohz(&kp.ki_start))); PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size)); PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize); PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss); PS_ADD("startcode", "%u", (unsigned)0); PS_ADD("endcode", "%u", 0); /* XXX */ PS_ADD("startstack", "%u", 0); /* XXX */ PS_ADD("kstkesp", "%u", 0); /* XXX */ PS_ADD("kstkeip", "%u", 0); /* XXX */ PS_ADD("signal", "%u", 0); /* XXX */ PS_ADD("blocked", "%u", 0); /* XXX */ PS_ADD("sigignore", "%u", 0); /* XXX */ PS_ADD("sigcatch", "%u", 0); /* XXX */ PS_ADD("wchan", "%u", 0); /* XXX */ PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap); PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap); PS_ADD("exitsignal", "%d", 0); /* XXX */ PS_ADD("processor", "%u", kp.ki_lastcpu); PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */ PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */ #undef PS_ADD sbuf_putc(sb, '\n'); return (0); } /* * Filler function for proc/pid/statm */ static int linprocfs_doprocstatm(PFS_FILL_ARGS) { struct kinfo_proc kp; segsz_t lsize; PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); /* * See comments in linprocfs_doprocstatus() regarding the * computation of lsize. */ /* size resident share trs drs lrs dt */ sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size)); sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize); sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */ sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize); sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "%ju ", (uintmax_t)lsize); sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */ return (0); } /* * Filler function for proc/pid/status */ static int linprocfs_doprocstatus(PFS_FILL_ARGS) { struct kinfo_proc kp; char *state; segsz_t lsize; struct thread *td2; struct sigacts *ps; int i; PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { PROC_SLOCK(p); switch(p->p_state) { case PRS_NEW: state = "I (idle)"; break; case PRS_NORMAL: if (p->p_flag & P_WEXIT) { state = "X (exiting)"; break; } switch(td2->td_state) { case TDS_INHIBITED: state = "S (sleeping)"; break; case TDS_RUNQ: case TDS_RUNNING: state = "R (running)"; break; default: state = "? (unknown)"; break; } break; case PRS_ZOMBIE: state = "Z (zombie)"; break; default: state = "? (unknown)"; break; } PROC_SUNLOCK(p); } fill_kinfo_proc(p, &kp); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); /* * Credentials */ sbuf_printf(sb, "Pid:\t%d\n", p->p_pid); sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ? p->p_pptr->p_pid : 0); sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid, p->p_ucred->cr_uid, p->p_ucred->cr_svuid, /* FreeBSD doesn't have fsuid */ p->p_ucred->cr_uid); sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid, p->p_ucred->cr_gid, p->p_ucred->cr_svgid, /* FreeBSD doesn't have fsgid */ p->p_ucred->cr_gid); sbuf_cat(sb, "Groups:\t"); for (i = 0; i < p->p_ucred->cr_ngroups; i++) sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]); PROC_UNLOCK(p); sbuf_putc(sb, '\n'); /* * Memory * * While our approximation of VmLib may not be accurate (I * don't know of a simple way to verify it, and I'm not sure * it has much meaning anyway), I believe it's good enough. * * The same code that could (I think) accurately compute VmLib * could also compute VmLck, but I don't really care enough to * implement it. Submissions are welcome. */ sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size)); sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */ sbuf_printf(sb, "VmRss:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize)); sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize)); sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize)); sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize)); /* * Signal masks * * We support up to 128 signals, while Linux supports 32, * but we only define 32 (the same 32 as Linux, to boot), so * just show the lower 32 bits of each mask. XXX hack. * * NB: on certain platforms (Sparc at least) Linux actually * supports 64 signals, but this code is a long way from * running on anything but i386, so ignore that for now. */ PROC_LOCK(p); sbuf_printf(sb, "SigPnd:\t%08x\n", p->p_siglist.__bits[0]); /* * I can't seem to find out where the signal mask is in * relation to struct proc, so SigBlk is left unimplemented. */ sbuf_printf(sb, "SigBlk:\t%08x\n", 0); /* XXX */ ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sbuf_printf(sb, "SigIgn:\t%08x\n", ps->ps_sigignore.__bits[0]); sbuf_printf(sb, "SigCgt:\t%08x\n", ps->ps_sigcatch.__bits[0]); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); /* * Linux also prints the capability masks, but we don't have * capabilities yet, and when we do get them they're likely to * be meaningless to Linux programs, so we lie. XXX */ sbuf_printf(sb, "CapInh:\t%016x\n", 0); sbuf_printf(sb, "CapPrm:\t%016x\n", 0); sbuf_printf(sb, "CapEff:\t%016x\n", 0); return (0); } /* * Filler function for proc/pid/cwd */ static int linprocfs_doproccwd(PFS_FILL_ARGS) { char *fullpath = "unknown"; char *freepath = NULL; vn_fullpath(td, p->p_fd->fd_cdir, &fullpath, &freepath); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/root */ static int linprocfs_doprocroot(PFS_FILL_ARGS) { struct vnode *rvp; char *fullpath = "unknown"; char *freepath = NULL; rvp = jailed(p->p_ucred) ? p->p_fd->fd_jdir : p->p_fd->fd_rdir; vn_fullpath(td, rvp, &fullpath, &freepath); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/cmdline */ static int linprocfs_doproccmdline(PFS_FILL_ARGS) { struct ps_strings pstr; char **ps_argvstr; int error, i; /* * If we are using the ps/cmdline caching, use that. Otherwise * revert back to the old way which only implements full cmdline * for the currept process and just p->p_comm for all other * processes. * Note that if the argv is no longer available, we deliberately * don't fall back on p->p_comm or return an error: the authentic * Linux behaviour is to return zero-length in this case. */ PROC_LOCK(p); if (p->p_args && p_cansee(td, p) == 0) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); } else if (p != td->td_proc) { PROC_UNLOCK(p); sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm); } else { PROC_UNLOCK(p); error = copyin((void *)p->p_sysent->sv_psstrings, &pstr, sizeof(pstr)); if (error) return (error); if (pstr.ps_nargvstr > ARG_MAX) return (E2BIG); ps_argvstr = malloc(pstr.ps_nargvstr * sizeof(char *), M_TEMP, M_WAITOK); error = copyin((void *)pstr.ps_argvstr, ps_argvstr, pstr.ps_nargvstr * sizeof(char *)); if (error) { free(ps_argvstr, M_TEMP); return (error); } for (i = 0; i < pstr.ps_nargvstr; i++) { sbuf_copyin(sb, ps_argvstr[i], 0); sbuf_printf(sb, "%c", '\0'); } free(ps_argvstr, M_TEMP); } return (0); } /* * Filler function for proc/pid/environ */ static int linprocfs_doprocenviron(PFS_FILL_ARGS) { sbuf_printf(sb, "doprocenviron\n%c", '\0'); return (0); } /* * Filler function for proc/pid/maps */ static int linprocfs_doprocmaps(PFS_FILL_ARGS) { char mebuffer[512]; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry, tmp_entry; vm_object_t obj, tobj, lobj; vm_offset_t saved_end; vm_ooffset_t off = 0; char *name = "", *freename = NULL; size_t len; ino_t ino; unsigned int last_timestamp; int ref_count, shadow_count, flags; int error; struct vnode *vp; struct vattr vat; int locked; PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); if (uio->uio_offset != 0) return (0); error = 0; vm_map_lock_read(map); for (entry = map->header.next; ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { name = ""; freename = NULL; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; saved_end = entry->end; obj = entry->object.vm_object; for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { VM_OBJECT_LOCK(tobj); if (lobj != obj) VM_OBJECT_UNLOCK(lobj); lobj = tobj; } ino = 0; if (lobj) { off = IDX_TO_OFF(lobj->size); if (lobj->type == OBJT_VNODE) { vp = lobj->handle; if (vp) vref(vp); } else vp = NULL; if (lobj != obj) VM_OBJECT_UNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_UNLOCK(obj); if (vp) { vn_fullpath(td, vp, &name, &freename); locked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); VOP_GETATTR(vp, &vat, td->td_ucred, td); ino = vat.va_fileid; vput(vp); VFS_UNLOCK_GIANT(locked); } } else { flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, access, offset, major, minor, inode, name. */ snprintf(mebuffer, sizeof mebuffer, "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n", (u_long)entry->start, (u_long)entry->end, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", "p", (u_long)off, 0, 0, (u_long)ino, *name ? " " : "", name ); if (freename) free(freename, M_TEMP); len = strlen(mebuffer); if (len > uio->uio_resid) len = uio->uio_resid; /* * XXX We should probably return * EFBIG here, as in procfs. */ last_timestamp = map->timestamp; vm_map_unlock_read(map); error = uiomove(mebuffer, len, uio); vm_map_lock_read(map); if (error) break; if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, saved_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); return (error); } /* * Filler function for proc/net/dev */ static int linprocfs_donetdev(PFS_FILL_ARGS) { char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; sbuf_printf(sb, "%6s|%58s|%s\n%6s|%58s|%58s\n", "Inter-", " Receive", " Transmit", " face", "bytes packets errs drop fifo frame compressed", "bytes packets errs drop fifo frame compressed"); IFNET_RLOCK(); TAILQ_FOREACH(ifp, &ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s:", ifname); sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); } IFNET_RUNLOCK(); return (0); } /* * Filler function for proc/sys/kernel/osrelease */ static int linprocfs_doosrelease(PFS_FILL_ARGS) { char osrelease[LINUX_MAX_UTSNAME]; linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s\n", osrelease); return (0); } /* * Filler function for proc/sys/kernel/ostype */ static int linprocfs_doostype(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); sbuf_printf(sb, "%s\n", osname); return (0); } /* * Filler function for proc/sys/kernel/version */ static int linprocfs_doosbuild(PFS_FILL_ARGS) { linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/sys/kernel/msgmni */ static int linprocfs_domsgmni(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmni); return (0); } /* * Filler function for proc/sys/kernel/pid_max */ static int linprocfs_dopid_max(PFS_FILL_ARGS) { sbuf_printf(sb, "%i\n", PID_MAX); return (0); } /* * Filler function for proc/sys/kernel/sem */ static int linprocfs_dosem(PFS_FILL_ARGS) { sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns, seminfo.semopm, seminfo.semmni); return (0); } /* * Filler function for proc/scsi/device_info */ static int linprocfs_doscsidevinfo(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/scsi/scsi */ static int linprocfs_doscsiscsi(PFS_FILL_ARGS) { return (0); } extern struct cdevsw *cdevsw[]; /* * Filler function for proc/devices */ static int linprocfs_dodevices(PFS_FILL_ARGS) { char *char_devices; sbuf_printf(sb, "Character devices:\n"); char_devices = linux_get_char_devices(); sbuf_printf(sb, "%s", char_devices); linux_free_get_char_devices(char_devices); sbuf_printf(sb, "\nBlock devices:\n"); return (0); } /* * Filler function for proc/cmdline */ static int linprocfs_docmdline(PFS_FILL_ARGS) { sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname); sbuf_printf(sb, " ro root=302\n"); return (0); } #if 0 /* * Filler function for proc/modules */ static int linprocfs_domodules(PFS_FILL_ARGS) { struct linker_file *lf; TAILQ_FOREACH(lf, &linker_files, link) { sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename, (unsigned long)lf->size, lf->refs); } return (0); } #endif /* * Constructor */ static int linprocfs_init(PFS_INIT_ARGS) { struct pfs_node *root; struct pfs_node *dir; root = pi->pi_root; /* /proc/... */ pfs_create_file(root, "cmdline", &linprocfs_docmdline, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "devices", &linprocfs_dodevices, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "loadavg", &linprocfs_doloadavg, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "meminfo", &linprocfs_domeminfo, NULL, NULL, NULL, PFS_RD); #if 0 pfs_create_file(root, "modules", &linprocfs_domodules, NULL, NULL, NULL, PFS_RD); #endif pfs_create_file(root, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "mtab", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_link(root, "self", &procfs_docurproc, NULL, NULL, NULL, 0); pfs_create_file(root, "stat", &linprocfs_dostat, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "uptime", &linprocfs_douptime, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "version", &linprocfs_doversion, NULL, NULL, NULL, PFS_RD); /* /proc/net/... */ dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0); pfs_create_file(dir, "dev", &linprocfs_donetdev, NULL, NULL, NULL, PFS_RD); /* /proc//... */ dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP); pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "cwd", &linprocfs_doproccwd, NULL, NULL, NULL, 0); pfs_create_file(dir, "environ", &linprocfs_doprocenviron, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "exe", &procfs_doprocfile, NULL, &procfs_notsystem, NULL, 0); pfs_create_file(dir, "maps", &linprocfs_doprocmaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "mem", &procfs_doprocmem, &procfs_attr, &procfs_candebug, NULL, PFS_RDWR|PFS_RAW); pfs_create_link(dir, "root", &linprocfs_doprocroot, NULL, NULL, NULL, 0); pfs_create_file(dir, "stat", &linprocfs_doprocstat, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "statm", &linprocfs_doprocstatm, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "status", &linprocfs_doprocstatus, NULL, NULL, NULL, PFS_RD); /* /proc/scsi/... */ dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0); pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi, NULL, NULL, NULL, PFS_RD); /* /proc/sys/... */ dir = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0); /* /proc/sys/kernel/... */ dir = pfs_create_dir(dir, "kernel", NULL, NULL, NULL, 0); pfs_create_file(dir, "osrelease", &linprocfs_doosrelease, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ostype", &linprocfs_doostype, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "version", &linprocfs_doosbuild, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmni", &linprocfs_domsgmni, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "pid_max", &linprocfs_dopid_max, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "sem", &linprocfs_dosem, NULL, NULL, NULL, PFS_RD); return (0); } /* * Destructor */ static int linprocfs_uninit(PFS_INIT_ARGS) { /* nothing to do, pseudofs will GC */ return (0); } PSEUDOFS(linprocfs, 1); MODULE_DEPEND(linprocfs, linux, 1, 1, 1); MODULE_DEPEND(linprocfs, procfs, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1); Index: head/sys/compat/linux/linux_file.c =================================================================== --- head/sys/compat/linux/linux_file.c (revision 175201) +++ head/sys/compat/linux/linux_file.c (revision 175202) @@ -1,1353 +1,1353 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include int linux_creat(struct thread *td, struct linux_creat_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(creat)) printf(ARGS(creat, "%s, %d"), path, args->mode); #endif error = kern_open(td, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); LFREEPATH(path); return (error); } static int linux_common_open(struct thread *td, char *path, int l_flags, int mode, int openat) { struct proc *p = td->td_proc; struct file *fp; int fd; int bsd_flags, error; bsd_flags = 0; switch (l_flags & LINUX_O_ACCMODE) { case LINUX_O_WRONLY: bsd_flags |= O_WRONLY; break; case LINUX_O_RDWR: bsd_flags |= O_RDWR; break; default: bsd_flags |= O_RDONLY; } if (l_flags & LINUX_O_NDELAY) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_O_APPEND) bsd_flags |= O_APPEND; if (l_flags & LINUX_O_SYNC) bsd_flags |= O_FSYNC; if (l_flags & LINUX_O_NONBLOCK) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_FASYNC) bsd_flags |= O_ASYNC; if (l_flags & LINUX_O_CREAT) bsd_flags |= O_CREAT; if (l_flags & LINUX_O_TRUNC) bsd_flags |= O_TRUNC; if (l_flags & LINUX_O_EXCL) bsd_flags |= O_EXCL; if (l_flags & LINUX_O_NOCTTY) bsd_flags |= O_NOCTTY; if (l_flags & LINUX_O_DIRECT) bsd_flags |= O_DIRECT; if (l_flags & LINUX_O_NOFOLLOW) bsd_flags |= O_NOFOLLOW; /* XXX LINUX_O_NOATIME: unable to be easily implemented. */ error = kern_open(td, path, UIO_SYSSPACE, bsd_flags, mode); if (!error) { fd = td->td_retval[0]; /* * XXX In between kern_open() and fget(), another process * having the same filedesc could use that fd without * checking below. */ error = fget(td, fd, &fp); if (!error) { sx_slock(&proctree_lock); PROC_LOCK(p); if (!(bsd_flags & O_NOCTTY) && SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); sx_unlock(&proctree_lock); if (fp->f_type == DTYPE_VNODE) (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); } else { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } if (l_flags & LINUX_O_DIRECTORY) { if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VDIR) { error = ENOTDIR; } } fdrop(fp, td); /* * XXX as above, fdrop()/kern_close() pair is racy. */ if (error) kern_close(td, fd); } } #ifdef DEBUG if (ldebug(open)) printf(LMSG("open returns error %d"), error); #endif if (!openat) LFREEPATH(path); return error; } /* * common code for linux *at set of syscalls * * works like this: * if filename is absolute * ignore dirfd * else * if dirfd == AT_FDCWD * return CWD/filename * else * return DIRFD/filename */ static int linux_at(struct thread *td, int dirfd, char *filename, char **newpath, char **freebuf) { struct file *fp; int error = 0, vfslocked; struct vnode *dvp; struct filedesc *fdp = td->td_proc->p_fd; char *fullpath = "unknown"; char *freepath = NULL; /* don't do anything if the pathname is absolute */ if (*filename == '/') { *newpath= filename; return (0); } /* check for AT_FDWCD */ if (dirfd == LINUX_AT_FDCWD) { FILEDESC_SLOCK(fdp); dvp = fdp->fd_cdir; vref(dvp); FILEDESC_SUNLOCK(fdp); } else { error = fget(td, dirfd, &fp); if (error) return (error); dvp = fp->f_vnode; /* only a dir can be dfd */ if (dvp->v_type != VDIR) { fdrop(fp, td); return (ENOTDIR); } vref(dvp); fdrop(fp, td); } /* * XXXRW: This is bogus, as vn_fullpath() returns only an advisory * file path, and may fail in several common situations, including * for file systmes that don't use the name cache, and if the entry * for the file falls out of the name cache. We should implement * openat() in the FreeBSD native system call layer properly (using a * requested starting directory), and have Linux and other ABIs wrap * the native implementation. */ error = vn_fullpath(td, dvp, &fullpath, &freepath); if (!error) { *newpath = malloc(strlen(fullpath) + strlen(filename) + 2, M_TEMP, M_WAITOK | M_ZERO); *freebuf = freepath; sprintf(*newpath, "%s/%s", fullpath, filename); } else { *newpath = NULL; } vfslocked = VFS_LOCK_GIANT(dvp->v_mount); vrele(dvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } int linux_openat(struct thread *td, struct linux_openat_args *args) { char *newpath, *oldpath, *freebuf, *path; int error; oldpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(args->filename, oldpath, MAXPATHLEN, NULL); if (error) { free(oldpath, M_TEMP); return (error); } #ifdef DEBUG if (ldebug(openat)) printf(ARGS(openat, "%i, %s, 0x%x, 0x%x"), args->dfd, oldpath, args->flags, args->mode); #endif newpath = freebuf = NULL; error = linux_at(td, args->dfd, oldpath, &newpath, &freebuf); if (error == 0) { #ifdef DEBUG if (ldebug(openat)) printf(LMSG("newpath: %s"), newpath); #endif if (args->flags & LINUX_O_CREAT) LCONVPATH_SEG(td, newpath, &path, 1, UIO_SYSSPACE); else LCONVPATH_SEG(td, newpath, &path, 0, UIO_SYSSPACE); } if (freebuf) free(freebuf, M_TEMP); if (*oldpath != '/') free(newpath, M_TEMP); if (error == 0) { error = linux_common_open(td, path, args->flags, args->mode, 1); LFREEPATH(path); } free(oldpath, M_TEMP); return (error); } int linux_open(struct thread *td, struct linux_open_args *args) { char *path; if (args->flags & LINUX_O_CREAT) LCONVPATHCREAT(td, args->path, &path); else LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(open)) printf(ARGS(open, "%s, 0x%x, 0x%x"), path, args->flags, args->mode); #endif return linux_common_open(td, path, args->flags, args->mode, 0); } int linux_lseek(struct thread *td, struct linux_lseek_args *args) { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ tmp_args; int error; #ifdef DEBUG if (ldebug(lseek)) printf(ARGS(lseek, "%d, %ld, %d"), args->fdes, (long)args->off, args->whence); #endif tmp_args.fd = args->fdes; tmp_args.offset = (off_t)args->off; tmp_args.whence = args->whence; error = lseek(td, &tmp_args); return error; } int linux_llseek(struct thread *td, struct linux_llseek_args *args) { struct lseek_args bsd_args; int error; off_t off; #ifdef DEBUG if (ldebug(llseek)) printf(ARGS(llseek, "%d, %d:%d, %d"), args->fd, args->ohigh, args->olow, args->whence); #endif off = (args->olow) | (((off_t) args->ohigh) << 32); bsd_args.fd = args->fd; bsd_args.offset = off; bsd_args.whence = args->whence; if ((error = lseek(td, &bsd_args))) return error; if ((error = copyout(td->td_retval, args->res, sizeof (off_t)))) return error; td->td_retval[0] = 0; return 0; } int linux_readdir(struct thread *td, struct linux_readdir_args *args) { struct linux_getdents_args lda; lda.fd = args->fd; lda.dent = args->dent; lda.count = 1; return linux_getdents(td, &lda); } /* * Note that linux_getdents(2) and linux_getdents64(2) have the same * arguments. They only differ in the definition of struct dirent they * operate on. We use this to common the code, with the exception of * accessing struct dirent. Note that linux_readdir(2) is implemented * by means of linux_getdents(2). In this case we never operate on * struct dirent64 and thus don't need to handle it... */ struct l_dirent { l_long d_ino; l_off_t d_off; l_ushort d_reclen; char d_name[LINUX_NAME_MAX + 1]; }; struct l_dirent64 { uint64_t d_ino; int64_t d_off; l_ushort d_reclen; u_char d_type; char d_name[LINUX_NAME_MAX + 1]; }; #define LINUX_RECLEN(de,namlen) \ ALIGN((((char *)&(de)->d_name - (char *)de) + (namlen) + 1)) #define LINUX_DIRBLKSIZ 512 static int getdents_common(struct thread *td, struct linux_getdents64_args *args, int is64bit) { struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen=0; /* Linux-format */ struct file *fp; struct uio auio; struct iovec aiov; off_t off; struct l_dirent linux_dirent; struct l_dirent64 linux_dirent64; int buflen, error, eofflag, nbytes, justone; u_long *cookies = NULL, *cookiep; int ncookies, vfslocked; nbytes = args->count; if (nbytes == 1) { /* readdir(2) case. Always struct dirent. */ if (is64bit) return (EINVAL); nbytes = sizeof(linux_dirent); justone = 1; } else justone = 0; if ((error = getvnode(td->td_proc->p_fd, args->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } off = fp->f_offset; buflen = max(LINUX_DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } #ifdef MAC /* * Do directory search MAC check using non-cached credentials. */ if ((error = mac_vnode_check_readdir(td->td_ucred, vp))) goto out; #endif /* MAC */ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies))) goto out; inp = buf; outp = (caddr_t)args->dirent; resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } linuxreclen = (is64bit) ? LINUX_RECLEN(&linux_dirent64, bdp->d_namlen) : LINUX_RECLEN(&linux_dirent, bdp->d_namlen); if (reclen > len || resid < linuxreclen) { outp++; break; } if (justone) { /* readdir(2) case. */ linux_dirent.d_ino = (l_long)bdp->d_fileno; linux_dirent.d_off = (l_off_t)linuxreclen; linux_dirent.d_reclen = (l_ushort)bdp->d_namlen; strcpy(linux_dirent.d_name, bdp->d_name); error = copyout(&linux_dirent, outp, linuxreclen); } else { if (is64bit) { linux_dirent64.d_ino = bdp->d_fileno; linux_dirent64.d_off = (cookiep) ? (l_off_t)*cookiep : (l_off_t)(off + reclen); linux_dirent64.d_reclen = (l_ushort)linuxreclen; linux_dirent64.d_type = bdp->d_type; strcpy(linux_dirent64.d_name, bdp->d_name); error = copyout(&linux_dirent64, outp, linuxreclen); } else { linux_dirent.d_ino = bdp->d_fileno; linux_dirent.d_off = (cookiep) ? (l_off_t)*cookiep : (l_off_t)(off + reclen); linux_dirent.d_reclen = (l_ushort)linuxreclen; strcpy(linux_dirent.d_name, bdp->d_name); error = copyout(&linux_dirent, outp, linuxreclen); } } if (error) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += linuxreclen; resid -= linuxreclen; len -= reclen; if (justone) break; } if (outp == (caddr_t)args->dirent) goto again; fp->f_offset = off; if (justone) nbytes = resid + linuxreclen; eof: td->td_retval[0] = nbytes - resid; out: if (cookies) free(cookies, M_TEMP); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); free(buf, M_TEMP); return (error); } int linux_getdents(struct thread *td, struct linux_getdents_args *args) { #ifdef DEBUG if (ldebug(getdents)) printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count); #endif return (getdents_common(td, (struct linux_getdents64_args*)args, 0)); } int linux_getdents64(struct thread *td, struct linux_getdents64_args *args) { #ifdef DEBUG if (ldebug(getdents64)) printf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count); #endif return (getdents_common(td, args, 1)); } /* * These exist mainly for hooks for doing /compat/linux translation. */ int linux_access(struct thread *td, struct linux_access_args *args) { char *path; int error; /* linux convention */ if (args->flags & ~(F_OK | X_OK | W_OK | R_OK)) return (EINVAL); LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(access)) printf(ARGS(access, "%s, %d"), path, args->flags); #endif error = kern_access(td, path, UIO_SYSSPACE, args->flags); LFREEPATH(path); return (error); } int linux_unlink(struct thread *td, struct linux_unlink_args *args) { char *path; int error; struct stat st; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(unlink)) printf(ARGS(unlink, "%s"), path); #endif error = kern_unlink(td, path, UIO_SYSSPACE); if (error == EPERM) /* Introduce POSIX noncompliant behaviour of Linux */ if (kern_stat(td, path, UIO_SYSSPACE, &st) == 0) if (S_ISDIR(st.st_mode)) error = EISDIR; LFREEPATH(path); return (error); } int linux_chdir(struct thread *td, struct linux_chdir_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(chdir)) printf(ARGS(chdir, "%s"), path); #endif error = kern_chdir(td, path, UIO_SYSSPACE); LFREEPATH(path); return (error); } int linux_chmod(struct thread *td, struct linux_chmod_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(chmod)) printf(ARGS(chmod, "%s, %d"), path, args->mode); #endif error = kern_chmod(td, path, UIO_SYSSPACE, args->mode); LFREEPATH(path); return (error); } int linux_mkdir(struct thread *td, struct linux_mkdir_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mkdir)) printf(ARGS(mkdir, "%s, %d"), path, args->mode); #endif error = kern_mkdir(td, path, UIO_SYSSPACE, args->mode); LFREEPATH(path); return (error); } int linux_rmdir(struct thread *td, struct linux_rmdir_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(rmdir)) printf(ARGS(rmdir, "%s"), path); #endif error = kern_rmdir(td, path, UIO_SYSSPACE); LFREEPATH(path); return (error); } int linux_rename(struct thread *td, struct linux_rename_args *args) { char *from, *to; int error; LCONVPATHEXIST(td, args->from, &from); /* Expand LCONVPATHCREATE so that `from' can be freed on errors */ error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1); if (to == NULL) { LFREEPATH(from); return (error); } #ifdef DEBUG if (ldebug(rename)) printf(ARGS(rename, "%s, %s"), from, to); #endif error = kern_rename(td, from, to, UIO_SYSSPACE); LFREEPATH(from); LFREEPATH(to); return (error); } int linux_symlink(struct thread *td, struct linux_symlink_args *args) { char *path, *to; int error; LCONVPATHEXIST(td, args->path, &path); /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1); if (to == NULL) { LFREEPATH(path); return (error); } #ifdef DEBUG if (ldebug(symlink)) printf(ARGS(symlink, "%s, %s"), path, to); #endif error = kern_symlink(td, path, to, UIO_SYSSPACE); LFREEPATH(path); LFREEPATH(to); return (error); } int linux_readlink(struct thread *td, struct linux_readlink_args *args) { char *name; int error; LCONVPATHEXIST(td, args->name, &name); #ifdef DEBUG if (ldebug(readlink)) printf(ARGS(readlink, "%s, %p, %d"), name, (void *)args->buf, args->count); #endif error = kern_readlink(td, name, UIO_SYSSPACE, args->buf, UIO_USERSPACE, args->count); LFREEPATH(name); return (error); } int linux_truncate(struct thread *td, struct linux_truncate_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(truncate)) printf(ARGS(truncate, "%s, %ld"), path, (long)args->length); #endif error = kern_truncate(td, path, UIO_SYSSPACE, args->length); LFREEPATH(path); return (error); } int linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args) { struct ftruncate_args /* { int fd; int pad; off_t length; } */ nuap; nuap.fd = args->fd; nuap.length = args->length; return (ftruncate(td, &nuap)); } int linux_link(struct thread *td, struct linux_link_args *args) { char *path, *to; int error; LCONVPATHEXIST(td, args->path, &path); /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1); if (to == NULL) { LFREEPATH(path); return (error); } #ifdef DEBUG if (ldebug(link)) printf(ARGS(link, "%s, %s"), path, to); #endif error = kern_link(td, path, to, UIO_SYSSPACE); LFREEPATH(path); LFREEPATH(to); return (error); } int linux_fdatasync(td, uap) struct thread *td; struct linux_fdatasync_args *uap; { struct fsync_args bsd; bsd.fd = uap->fd; return fsync(td, &bsd); } int linux_pread(td, uap) struct thread *td; struct linux_pread_args *uap; { struct pread_args bsd; struct vnode *vp; int error; bsd.fd = uap->fd; bsd.buf = uap->buf; bsd.nbyte = uap->nbyte; bsd.offset = uap->offset; error = pread(td, &bsd); if (error == 0) { /* This seems to violate POSIX but linux does it */ if ((error = fgetvp(td, uap->fd, &vp)) != 0) return (error); if (vp->v_type == VDIR) { vrele(vp); return (EISDIR); } vrele(vp); } return (error); } int linux_pwrite(td, uap) struct thread *td; struct linux_pwrite_args *uap; { struct pwrite_args bsd; bsd.fd = uap->fd; bsd.buf = uap->buf; bsd.nbyte = uap->nbyte; bsd.offset = uap->offset; return pwrite(td, &bsd); } int linux_mount(struct thread *td, struct linux_mount_args *args) { struct ufs_args ufs; char fstypename[MFSNAMELEN]; char mntonname[MNAMELEN], mntfromname[MNAMELEN]; int error; int fsflags; void *fsdata; error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1, NULL); if (error) return (error); error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL); if (error) return (error); error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL); if (error) return (error); #ifdef DEBUG if (ldebug(mount)) printf(ARGS(mount, "%s, %s, %s"), fstypename, mntfromname, mntonname); #endif if (strcmp(fstypename, "ext2") == 0) { strcpy(fstypename, "ext2fs"); fsdata = &ufs; ufs.fspec = mntfromname; #define DEFAULT_ROOTID -2 ufs.export.ex_root = DEFAULT_ROOTID; ufs.export.ex_flags = args->rwflag & LINUX_MS_RDONLY ? MNT_EXRDONLY : 0; } else if (strcmp(fstypename, "proc") == 0) { strcpy(fstypename, "linprocfs"); fsdata = NULL; } else { return (ENODEV); } fsflags = 0; if ((args->rwflag & 0xffff0000) == 0xc0ed0000) { /* * Linux SYNC flag is not included; the closest equivalent * FreeBSD has is !ASYNC, which is our default. */ if (args->rwflag & LINUX_MS_RDONLY) fsflags |= MNT_RDONLY; if (args->rwflag & LINUX_MS_NOSUID) fsflags |= MNT_NOSUID; if (args->rwflag & LINUX_MS_NOEXEC) fsflags |= MNT_NOEXEC; if (args->rwflag & LINUX_MS_REMOUNT) fsflags |= MNT_UPDATE; } if (strcmp(fstypename, "linprocfs") == 0) { error = kernel_vmount(fsflags, "fstype", fstypename, "fspath", mntonname, NULL); } else error = EOPNOTSUPP; return (error); } int linux_oldumount(struct thread *td, struct linux_oldumount_args *args) { struct linux_umount_args args2; args2.path = args->path; args2.flags = 0; return (linux_umount(td, &args2)); } int linux_umount(struct thread *td, struct linux_umount_args *args) { struct unmount_args bsd; bsd.path = args->path; bsd.flags = args->flags; /* XXX correct? */ return (unmount(td, &bsd)); } /* * fcntl family of syscalls */ struct l_flock { l_short l_type; l_short l_whence; l_off_t l_start; l_off_t l_len; l_pid_t l_pid; } #if defined(__amd64__) && defined(COMPAT_LINUX32) __packed #endif ; static void linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; } static void bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_off_t)bsd_flock->l_start; linux_flock->l_len = (l_off_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct l_flock64 { l_short l_type; l_short l_whence; l_loff_t l_start; l_loff_t l_len; l_pid_t l_pid; } #if defined(__amd64__) && defined(COMPAT_LINUX32) __packed #endif ; static void linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; } static void bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_loff_t)bsd_flock->l_start; linux_flock->l_len = (l_loff_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ static int fcntl_common(struct thread *td, struct linux_fcntl64_args *args) { struct l_flock linux_flock; struct flock bsd_flock; struct file *fp; long arg; int error, result; switch (args->cmd) { case LINUX_F_DUPFD: return (kern_fcntl(td, args->fd, F_DUPFD, args->arg)); case LINUX_F_GETFD: return (kern_fcntl(td, args->fd, F_GETFD, 0)); case LINUX_F_SETFD: return (kern_fcntl(td, args->fd, F_SETFD, args->arg)); case LINUX_F_GETFL: error = kern_fcntl(td, args->fd, F_GETFL, 0); result = td->td_retval[0]; td->td_retval[0] = 0; if (result & O_RDONLY) td->td_retval[0] |= LINUX_O_RDONLY; if (result & O_WRONLY) td->td_retval[0] |= LINUX_O_WRONLY; if (result & O_RDWR) td->td_retval[0] |= LINUX_O_RDWR; if (result & O_NDELAY) td->td_retval[0] |= LINUX_O_NONBLOCK; if (result & O_APPEND) td->td_retval[0] |= LINUX_O_APPEND; if (result & O_FSYNC) td->td_retval[0] |= LINUX_O_SYNC; if (result & O_ASYNC) td->td_retval[0] |= LINUX_FASYNC; #ifdef LINUX_O_NOFOLLOW if (result & O_NOFOLLOW) td->td_retval[0] |= LINUX_O_NOFOLLOW; #endif #ifdef LINUX_O_DIRECT if (result & O_DIRECT) td->td_retval[0] |= LINUX_O_DIRECT; #endif return (error); case LINUX_F_SETFL: arg = 0; if (args->arg & LINUX_O_NDELAY) arg |= O_NONBLOCK; if (args->arg & LINUX_O_APPEND) arg |= O_APPEND; if (args->arg & LINUX_O_SYNC) arg |= O_FSYNC; if (args->arg & LINUX_FASYNC) arg |= O_ASYNC; #ifdef LINUX_O_NOFOLLOW if (args->arg & LINUX_O_NOFOLLOW) arg |= O_NOFOLLOW; #endif #ifdef LINUX_O_DIRECT if (args->arg & LINUX_O_DIRECT) arg |= O_DIRECT; #endif return (kern_fcntl(td, args->fd, F_SETFL, arg)); case LINUX_F_GETLK: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock); if (error) return (error); bsd_to_linux_flock(&bsd_flock, &linux_flock); return (copyout(&linux_flock, (void *)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLK, (intptr_t)&bsd_flock)); case LINUX_F_SETLKW: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLKW, (intptr_t)&bsd_flock)); case LINUX_F_GETOWN: return (kern_fcntl(td, args->fd, F_GETOWN, 0)); case LINUX_F_SETOWN: /* * XXX some Linux applications depend on F_SETOWN having no * significant effect for pipes (SIGIO is not delivered for * pipes under Linux-2.2.35 at least). */ error = fget(td, args->fd, &fp); if (error) return (error); if (fp->f_type == DTYPE_PIPE) { fdrop(fp, td); return (EINVAL); } fdrop(fp, td); return (kern_fcntl(td, args->fd, F_SETOWN, args->arg)); } return (EINVAL); } int linux_fcntl(struct thread *td, struct linux_fcntl_args *args) { struct linux_fcntl64_args args64; #ifdef DEBUG if (ldebug(fcntl)) printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd); #endif args64.fd = args->fd; args64.cmd = args->cmd; args64.arg = args->arg; return (fcntl_common(td, &args64)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args) { struct l_flock64 linux_flock; struct flock bsd_flock; int error; #ifdef DEBUG if (ldebug(fcntl64)) printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd); #endif switch (args->cmd) { case LINUX_F_GETLK64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock); if (error) return (error); bsd_to_linux_flock64(&bsd_flock, &linux_flock); return (copyout(&linux_flock, (void *)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLK, (intptr_t)&bsd_flock)); case LINUX_F_SETLKW64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLKW, (intptr_t)&bsd_flock)); } return (fcntl_common(td, args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_chown(struct thread *td, struct linux_chown_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(chown)) printf(ARGS(chown, "%s, %d, %d"), path, args->uid, args->gid); #endif error = kern_chown(td, path, UIO_SYSSPACE, args->uid, args->gid); LFREEPATH(path); return (error); } int linux_lchown(struct thread *td, struct linux_lchown_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(lchown)) printf(ARGS(lchown, "%s, %d, %d"), path, args->uid, args->gid); #endif error = kern_lchown(td, path, UIO_SYSSPACE, args->uid, args->gid); LFREEPATH(path); return (error); } Index: head/sys/compat/linux/linux_getcwd.c =================================================================== --- head/sys/compat/linux/linux_getcwd.c (revision 175201) +++ head/sys/compat/linux/linux_getcwd.c (revision 175202) @@ -1,477 +1,477 @@ /* $OpenBSD: linux_getcwd.c,v 1.2 2001/05/16 12:50:21 ho Exp $ */ /* $NetBSD: vfs_getcwd.c,v 1.3.2.3 1999/07/11 10:24:09 sommerfeld Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Bill Sommerfeld. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX only for DIRBLKSIZ */ #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include static int linux_getcwd_scandir(struct vnode **, struct vnode **, char **, char *, struct thread *); static int linux_getcwd_common(struct vnode *, struct vnode *, char **, char *, int, int, struct thread *); #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) /* * Vnode variable naming conventions in this file: * * rvp: the current root we're aiming towards. * lvp, *lvpp: the "lower" vnode * uvp, *uvpp: the "upper" vnode. * * Since all the vnodes we're dealing with are directories, and the * lookups are going *up* in the filesystem rather than *down*, the * usual "pvp" (parent) or "dvp" (directory) naming conventions are * too confusing. */ /* * XXX Will infinite loop in certain cases if a directory read reliably * returns EINVAL on last block. * XXX is EINVAL the right thing to return if a directory is malformed? */ /* * XXX Untested vs. mount -o union; probably does the wrong thing. */ /* * Find parent vnode of *lvpp, return in *uvpp * * If we care about the name, scan it looking for name of directory * entry pointing at lvp. * * Place the name in the buffer which starts at bufp, immediately * before *bpp, and move bpp backwards to point at the start of it. * * On entry, *lvpp is a locked vnode reference; on exit, it is vput and NULL'ed * On exit, *uvpp is either NULL or is a locked vnode reference. */ static int linux_getcwd_scandir(lvpp, uvpp, bpp, bufp, td) struct vnode **lvpp; struct vnode **uvpp; char **bpp; char *bufp; struct thread *td; { int error = 0; int eofflag; off_t off; int tries; struct uio uio; struct iovec iov; char *dirbuf = NULL; int dirbuflen; ino_t fileno; struct vattr va; struct vnode *uvp = NULL; struct vnode *lvp = *lvpp; struct componentname cn; int len, reclen; tries = 0; /* * If we want the filename, get some info we need while the * current directory is still locked. */ if (bufp != NULL) { error = VOP_GETATTR(lvp, &va, td->td_ucred, td); if (error) { vput(lvp); *lvpp = NULL; *uvpp = NULL; return error; } } /* * Ok, we have to do it the hard way.. * Next, get parent vnode using lookup of .. */ cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY; cn.cn_thread = td; cn.cn_cred = td->td_ucred; cn.cn_pnbuf = NULL; cn.cn_nameptr = ".."; cn.cn_namelen = 2; cn.cn_consume = 0; cn.cn_lkflags = LK_EXCLUSIVE; /* * At this point, lvp is locked and will be unlocked by the lookup. * On successful return, *uvpp will be locked */ #ifdef MAC error = mac_vnode_check_lookup(td->td_ucred, lvp, &cn); if (error == 0) #endif error = VOP_LOOKUP(lvp, uvpp, &cn); if (error) { vput(lvp); *lvpp = NULL; *uvpp = NULL; return error; } uvp = *uvpp; /* If we don't care about the pathname, we're done */ if (bufp == NULL) { vput(lvp); *lvpp = NULL; return 0; } fileno = va.va_fileid; dirbuflen = DIRBLKSIZ; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); #if 0 unionread: #endif off = 0; do { /* call VOP_READDIR of parent */ iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; eofflag = 0; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, uvp); if (error == 0) #endif /* MAC */ error = VOP_READDIR(uvp, &uio, td->td_ucred, &eofflag, 0, 0); off = uio.uio_offset; /* * Try again if NFS tosses its cookies. * XXX this can still loop forever if the directory is busted * such that the second or subsequent page of it always * returns EINVAL */ if ((error == EINVAL) && (tries < 3)) { off = 0; tries++; continue; /* once more, with feeling */ } if (!error) { char *cpos; struct dirent *dp; cpos = dirbuf; tries = 0; /* scan directory page looking for matching vnode */ for (len = (dirbuflen - uio.uio_resid); len > 0; len -= reclen) { dp = (struct dirent *) cpos; reclen = dp->d_reclen; /* check for malformed directory.. */ if (reclen < DIRENT_MINSIZE) { error = EINVAL; goto out; } /* * XXX should perhaps do VOP_LOOKUP to * check that we got back to the right place, * but getting the locking games for that * right would be heinous. */ if ((dp->d_type != DT_WHT) && (dp->d_fileno == fileno)) { char *bp = *bpp; bp -= dp->d_namlen; if (bp <= bufp) { error = ERANGE; goto out; } bcopy(dp->d_name, bp, dp->d_namlen); error = 0; *bpp = bp; goto out; } cpos += reclen; } } } while (!eofflag); error = ENOENT; out: vput(lvp); *lvpp = NULL; free(dirbuf, M_TEMP); return error; } /* * common routine shared by sys___getcwd() and linux_vn_isunder() */ #define GETCWD_CHECK_ACCESS 0x0001 static int linux_getcwd_common (lvp, rvp, bpp, bufp, limit, flags, td) struct vnode *lvp; struct vnode *rvp; char **bpp; char *bufp; int limit; int flags; struct thread *td; { struct filedesc *fdp = td->td_proc->p_fd; struct vnode *uvp = NULL; char *bp = NULL; int error; int perms = VEXEC; if (rvp == NULL) { rvp = fdp->fd_rdir; if (rvp == NULL) rvp = rootvnode; } VREF(rvp); VREF(lvp); /* * Error handling invariant: * Before a `goto out': * lvp is either NULL, or locked and held. * uvp is either NULL, or locked and held. */ - error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td); + error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) panic("vn_lock LK_RETRY returned error %d", error); if (bufp) bp = *bpp; /* * this loop will terminate when one of the following happens: * - we hit the root * - getdirentries or lookup fails * - we run out of space in the buffer. */ if (lvp == rvp) { if (bp) *(--bp) = '/'; goto out; } do { if (lvp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * access check here is optional, depending on * whether or not caller cares. */ if (flags & GETCWD_CHECK_ACCESS) { error = VOP_ACCESS(lvp, perms, td->td_ucred, td); if (error) goto out; perms = VEXEC|VREAD; } /* * step up if we're a covered vnode.. */ while (lvp->v_vflag & VV_ROOT) { struct vnode *tvp; if (lvp == rvp) goto out; tvp = lvp; lvp = lvp->v_mount->mnt_vnodecovered; vput(tvp); /* * hodie natus est radici frater */ if (lvp == NULL) { error = ENOENT; goto out; } VREF(lvp); - error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td); + error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) panic("vn_lock LK_RETRY returned %d", error); } error = linux_getcwd_scandir(&lvp, &uvp, &bp, bufp, td); if (error) goto out; #ifdef DIAGNOSTIC if (lvp != NULL) panic("getcwd: oops, forgot to null lvp"); if (bufp && (bp <= bufp)) { panic("getcwd: oops, went back too far"); } #endif if (bp) *(--bp) = '/'; lvp = uvp; uvp = NULL; limit--; } while ((lvp != rvp) && (limit > 0)); out: if (bpp) *bpp = bp; if (uvp) vput(uvp); if (lvp) vput(lvp); vrele(rvp); return error; } /* * Find pathname of process's current directory. * * Use vfs vnode-to-name reverse cache; if that fails, fall back * to reading directory contents. */ int linux_getcwd(struct thread *td, struct linux_getcwd_args *args) { caddr_t bp, bend, path; int error, len, lenused; #ifdef DEBUG if (ldebug(getcwd)) printf(ARGS(getcwd, "%p, %ld"), args->buf, (long)args->bufsize); #endif len = args->bufsize; if (len > MAXPATHLEN*4) len = MAXPATHLEN*4; else if (len < 2) return ERANGE; path = (char *)malloc(len, M_TEMP, M_WAITOK); error = kern___getcwd(td, path, UIO_SYSSPACE, len); if (!error) { lenused = strlen(path) + 1; if (lenused <= args->bufsize) { td->td_retval[0] = lenused; error = copyout(path, args->buf, lenused); } else error = ERANGE; } else { bp = &path[len]; bend = bp; *(--bp) = '\0'; /* * 5th argument here is "max number of vnodes to traverse". * Since each entry takes up at least 2 bytes in the output buffer, * limit it to N/2 vnodes for an N byte buffer. */ mtx_lock(&Giant); error = linux_getcwd_common (td->td_proc->p_fd->fd_cdir, NULL, &bp, path, len/2, GETCWD_CHECK_ACCESS, td); mtx_unlock(&Giant); if (error) goto out; lenused = bend - bp; td->td_retval[0] = lenused; /* put the result into user buffer */ error = copyout(bp, args->buf, lenused); } out: free(path, M_TEMP); return (error); } Index: head/sys/compat/opensolaris/kern/opensolaris_kobj.c =================================================================== --- head/sys/compat/opensolaris/kern/opensolaris_kobj.c (revision 175201) +++ head/sys/compat/opensolaris/kern/opensolaris_kobj.c (revision 175202) @@ -1,220 +1,220 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include void kobj_free(void *address, size_t size) { kmem_free(address, size); } void * kobj_alloc(size_t size, int flag) { return (kmem_alloc(size, (flag & KM_NOWAIT) ? KM_NOSLEEP : KM_SLEEP)); } void * kobj_zalloc(size_t size, int flag) { void *p; if ((p = kobj_alloc(size, flag)) != NULL) bzero(p, size); return (p); } static void * kobj_open_file_vnode(const char *file) { struct thread *td = curthread; struct nameidata nd; int error, flags; if (td->td_proc->p_fd->fd_rdir == NULL) td->td_proc->p_fd->fd_rdir = rootvnode; if (td->td_proc->p_fd->fd_cdir == NULL) td->td_proc->p_fd->fd_cdir = rootvnode; flags = FREAD; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td); error = vn_open_cred(&nd, &flags, 0, td->td_ucred, NULL); NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) return (NULL); /* We just unlock so we hold a reference. */ VOP_UNLOCK(nd.ni_vp, 0, td); return (nd.ni_vp); } static void * kobj_open_file_loader(const char *file) { return (preload_search_by_name(file)); } struct _buf * kobj_open_file(const char *file) { struct _buf *out; out = kmem_alloc(sizeof(*out), KM_SLEEP); out->mounted = root_mounted(); /* * If root is already mounted we read file using file system, * if not, we use loader. */ if (out->mounted) out->ptr = kobj_open_file_vnode(file); else out->ptr = kobj_open_file_loader(file); if (out->ptr == NULL) { kmem_free(out, sizeof(*out)); return ((struct _buf *)-1); } return (out); } static int kobj_get_filesize_vnode(struct _buf *file, uint64_t *size) { struct vnode *vp = file->ptr; struct thread *td = curthread; struct vattr va; int error; - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) *size = (uint64_t)va.va_size; return (error); } static int kobj_get_filesize_loader(struct _buf *file, uint64_t *size) { void *ptr; ptr = preload_search_info(file->ptr, MODINFO_SIZE); if (ptr == NULL) return (ENOENT); *size = (uint64_t)*(size_t *)ptr; return (0); } int kobj_get_filesize(struct _buf *file, uint64_t *size) { if (file->mounted) return (kobj_get_filesize_vnode(file, size)); else return (kobj_get_filesize_loader(file, size)); } int kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off) { struct vnode *vp = file->ptr; struct thread *td = curthread; struct uio auio; struct iovec aiov; int error; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); aiov.iov_base = buf; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_offset = (off_t)off; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_iovcnt = 1; auio.uio_resid = size; auio.uio_td = td; - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred); VOP_UNLOCK(vp, 0, td); return (error != 0 ? -1 : size - auio.uio_resid); } int kobj_read_file_loader(struct _buf *file, char *buf, unsigned size, unsigned off) { char *ptr; ptr = preload_search_info(file->ptr, MODINFO_ADDR); if (ptr == NULL) return (ENOENT); ptr = *(void **)ptr; bcopy(ptr + off, buf, size); return (0); } int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) { if (file->mounted) return (kobj_read_file_vnode(file, buf, size, off)); else return (kobj_read_file_loader(file, buf, size, off)); } void kobj_close_file(struct _buf *file) { if (file->mounted) { struct vnode *vp = file->ptr; struct thread *td = curthread; int flags = FREAD; vn_close(vp, flags, td->td_ucred, td); } kmem_free(file, sizeof(*file)); } Index: head/sys/compat/opensolaris/kern/opensolaris_vfs.c =================================================================== --- head/sys/compat/opensolaris/kern/opensolaris_vfs.c (revision 175201) +++ head/sys/compat/opensolaris/kern/opensolaris_vfs.c (revision 175202) @@ -1,280 +1,280 @@ /*- * Copyright (c) 2006-2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include MALLOC_DECLARE(M_MOUNT); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, int flags __unused) { struct vfsopt *opt; size_t namesize; if (vfsp->mnt_opt == NULL) { vfsp->mnt_opt = malloc(sizeof(*vfsp->mnt_opt), M_MOUNT, M_WAITOK); TAILQ_INIT(vfsp->mnt_opt); } opt = malloc(sizeof(*opt), M_MOUNT, M_WAITOK); namesize = strlen(name) + 1; opt->name = malloc(namesize, M_MOUNT, M_WAITOK); strlcpy(opt->name, name, namesize); if (arg == NULL) { opt->value = NULL; opt->len = 0; } else { opt->len = strlen(arg) + 1; opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(arg, opt->value, opt->len); } /* TODO: Locking. */ TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); } void vfs_clearmntopt(vfs_t *vfsp, const char *name) { struct vfsopt *opt; if (vfsp->mnt_opt == NULL) return; /* TODO: Locking. */ TAILQ_FOREACH(opt, vfsp->mnt_opt, link) { if (strcmp(opt->name, name) == 0) break; } if (opt != NULL) { TAILQ_REMOVE(vfsp->mnt_opt, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } } int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) { struct vfsoptlist *opts = vfsp->mnt_opt; int error; if (opts == NULL) return (0); error = vfs_getopt(opts, opt, (void **)argp, NULL); return (error != 0 ? 0 : 1); } int traverse(vnode_t **cvpp, int lktype) { kthread_t *td = curthread; vnode_t *cvp; vnode_t *tvp; vfs_t *vfsp; int error; cvp = *cvpp; tvp = NULL; /* * If this vnode is mounted on, then we transparently indirect * to the vnode which is the root of the mounted file system. * Before we do this we must check that an unmount is not in * progress on this vnode. */ for (;;) { /* * Reached the end of the mount chain? */ vfsp = vn_mountedvfs(cvp); if (vfsp == NULL) break; /* * tvp is NULL for *cvpp vnode, which we can't unlock. */ if (tvp != NULL) vput(cvp); else vrele(cvp); /* * The read lock must be held across the call to VFS_ROOT() to * prevent a concurrent unmount from destroying the vfs. */ error = VFS_ROOT(vfsp, lktype, &tvp, td); if (error != 0) return (error); cvp = tvp; } *cvpp = cvp; return (0); } int domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath, char *fspec, int fsflags) { struct mount *mp; struct vfsconf *vfsp; struct ucred *newcr, *oldcr; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (vp->v_type != VDIR) return (ENOTDIR); VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); mp = vfs_mount_alloc(vp, vfsp, fspath, td); VOP_UNLOCK(vp, 0, td); mp->mnt_optnew = NULL; vfs_setmntopt(mp, "from", fspec, 0); mp->mnt_optnew = mp->mnt_opt; mp->mnt_opt = NULL; /* * Set the mount level flags. * crdup() can sleep, so do it before acquiring a mutex. */ newcr = crdup(kcred); MNT_ILOCK(mp); if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS); /* * Unprivileged user can trigger mounting a snapshot, but we don't want * him to unmount it, so we switch to privileged credentials. */ oldcr = mp->mnt_cred; mp->mnt_cred = newcr; mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid; MNT_IUNLOCK(mp); crfree(oldcr); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; (void)VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Put the new filesystem on the mount list after root. */ #ifdef FREEBSD_NAMECACHE cache_purge(vp); #endif if (!error) { vnode_t *mvp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp, td)) panic("mount: lost mount"); mountcheckdirs(vp, mvp); vput(mvp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if (error) vrele(vp); else vfs_mountedfrom(mp, fspec); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0, td); vfs_unbusy(mp, td); vfs_mount_destroy(mp); } return (error); } Index: head/sys/compat/opensolaris/sys/vnode.h =================================================================== --- head/sys/compat/opensolaris/sys/vnode.h (revision 175201) +++ head/sys/compat/opensolaris/sys/vnode.h (revision 175202) @@ -1,268 +1,268 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_VNODE_H_ #define _OPENSOLARIS_SYS_VNODE_H_ #include_next #include #include #include #include #include #include #include typedef struct vnode vnode_t; typedef struct vattr vattr_t; typedef void caller_context_t; typedef struct vop_vector vnodeops_t; #define vop_fid vop_vptofh #define vop_fid_args vop_vptofh_args #define a_fid a_fhp #define v_count v_usecount static __inline int vn_is_readonly(vnode_t *vp) { return (vp->v_mount->mnt_flag & MNT_RDONLY); } #define vn_vfswlock(vp) (0) #define vn_vfsunlock(vp) do { } while (0) #define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) #define vn_mountedvfs(vp) ((vp)->v_mountedhere) #define vn_has_cached_data(vp) ((vp)->v_object != NULL && (vp)->v_object->resident_page_count > 0) #define VN_HOLD(v) vref(v) #define VN_RELE(v) vrele(v) #define VN_URELE(v) vput(v) #define VOP_REALVP(vp, vpp) (*(vpp) = (vp), 0) #define vnevent_remove(vp) do { } while (0) #define vnevent_rmdir(vp) do { } while (0) #define vnevent_rename_src(vp) do { } while (0) #define vnevent_rename_dest(vp) do { } while (0) #define IS_DEVVP(vp) \ ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) #define MODEMASK ALLPERMS #define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) #define MANDMODE(mode) (0) #define chklock(vp, op, offset, size, mode, ct) (0) #define cleanlocks(vp, pid, foo) do { } while (0) #define cleanshares(vp, pid) do { } while (0) /* * We will use va_spare is place of Solaris' va_mask. * This field is initialized in zfs_setattr(). */ #define va_mask va_spare /* TODO: va_fileid is shorter than va_nodeid !!! */ #define va_nodeid va_fileid /* TODO: This field needs conversion! */ #define va_nblocks va_bytes #define va_blksize va_blocksize #define va_seq va_gen #define MAXOFFSET_T OFF_MAX #define EXCL 0 #define AT_TYPE 0x0001 #define AT_MODE 0x0002 #define AT_UID 0x0004 #define AT_GID 0x0008 #define AT_FSID 0x0010 #define AT_NODEID 0x0020 #define AT_NLINK 0x0040 #define AT_SIZE 0x0080 #define AT_ATIME 0x0100 #define AT_MTIME 0x0200 #define AT_CTIME 0x0400 #define AT_RDEV 0x0800 #define AT_BLKSIZE 0x1000 #define AT_NBLOCKS 0x2000 #define AT_SEQ 0x4000 #define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\ AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) #define ACCESSED (AT_ATIME) #define STATE_CHANGED (AT_CTIME) #define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) static __inline void vattr_init_mask(vattr_t *vap) { vap->va_mask = 0; if (vap->va_type != VNON) vap->va_mask |= AT_TYPE; if (vap->va_uid != (uid_t)VNOVAL) vap->va_mask |= AT_UID; if (vap->va_gid != (gid_t)VNOVAL) vap->va_mask |= AT_GID; if (vap->va_size != (u_quad_t)VNOVAL) vap->va_mask |= AT_SIZE; if (vap->va_atime.tv_sec != VNOVAL) vap->va_mask |= AT_ATIME; if (vap->va_mtime.tv_sec != VNOVAL) vap->va_mask |= AT_MTIME; if (vap->va_mode != (u_short)VNOVAL) vap->va_mask |= AT_MODE; } #define FCREAT O_CREAT #define FTRUNC O_TRUNC #define FDSYNC FFSYNC #define FRSYNC FFSYNC #define FSYNC FFSYNC #define FOFFMAX 0x00 enum create { CRCREAT }; static __inline int zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, vnode_t **vpp, enum create crwhy, mode_t umask) { struct thread *td = curthread; struct nameidata nd; int error; ASSERT(seg == UIO_SYSSPACE); ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); ASSERT(crwhy == CRCREAT); ASSERT(umask == 0); if (td->td_proc->p_fd->fd_rdir == NULL) td->td_proc->p_fd->fd_rdir = rootvnode; if (td->td_proc->p_fd->fd_cdir == NULL) td->td_proc->p_fd->fd_cdir = rootvnode; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pnamep, td); error = vn_open_cred(&nd, &filemode, createmode, td->td_ucred, NULL); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) { /* We just unlock so we hold a reference. */ VN_HOLD(nd.ni_vp); VOP_UNLOCK(nd.ni_vp, 0, td); *vpp = nd.ni_vp; } return (error); } #define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \ zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask)) #define RLIM64_INFINITY 0 static __inline int zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr, ssize_t *residp) { struct thread *td = curthread; int error, vfslocked, resid; ASSERT(rw == UIO_WRITE); ASSERT(ioflag == 0); ASSERT(ulimit == RLIM64_INFINITY); ioflag = IO_APPEND | IO_UNIT; vfslocked = VFS_LOCK_GIANT(vp->v_mount); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED, &resid, td); VFS_UNLOCK_GIANT(vfslocked); if (residp != NULL) *residp = (ssize_t)resid; return (error); } #define vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \ zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp)) static __inline int zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr) { struct thread *td = curthread; struct mount *mp; int error, vfslocked; ASSERT(flag == FSYNC); vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); return (error); } #define VOP_FSYNC(vp, flag, cr) zfs_vop_fsync((vp), (flag), (cr)) static __inline int zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { ASSERT(flag == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); ASSERT(count == 1); ASSERT(offset == 0); return (vn_close(vp, flag, cr, curthread)); } #define VOP_CLOSE(vp, oflags, count, offset, cr) \ zfs_vop_close((vp), (oflags), (count), (offset), (cr)) static __inline int vn_rename(char *from, char *to, enum uio_seg seg) { ASSERT(seg == UIO_SYSSPACE); return (kern_rename(curthread, from, to, seg)); } enum rm { RMFILE }; static __inline int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) { ASSERT(seg == UIO_SYSSPACE); ASSERT(dirflag == RMFILE); return (kern_unlink(curthread, fnamep, seg)); } #endif /* _OPENSOLARIS_SYS_VNODE_H_ */ Index: head/sys/compat/pecoff/imgact_pecoff.c =================================================================== --- head/sys/compat/pecoff/imgact_pecoff.c (revision 175201) +++ head/sys/compat/pecoff/imgact_pecoff.c (revision 175202) @@ -1,606 +1,606 @@ /*- * Copyright (c) 2000 Masaru OKI * Copyright (c) 1994, 1995, 1998 Scott Bartram * Copyright (c) 1994 Adam Glass * Copyright (c) 1993, 1994 Christopher G. Demetriou * * originally from NetBSD kern/exec_ecoff.c * * Copyright (c) 2000 Takanori Watanabe * Copyright (c) 2000 KUROSAWA Takahiro * Copyright (c) 1995-1996 Sen Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * originally from FreeBSD kern/imgact_elf.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Masaru OKI. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_pecoff.h" #define PECOFF_PE_SIGNATURE "PE\0\0" static int pecoff_fixup(register_t **, struct image_params *); #ifndef PECOFF_DEBUG #define DPRINTF(a) #else #define DPRINTF(a) printf a #endif static struct sysentvec pecoff_sysvec = { SYS_MAXSYSCALL, sysent, 0, 0, NULL, 0, NULL, NULL, pecoff_fixup, sendsig, sigcode, &szsigcode, 0, "FreeBSD PECoff", NULL, NULL, MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs, NULL }; static const char signature[] = PECOFF_PE_SIGNATURE; static int exec_pecoff_coff_prep_omagic(struct image_params *, struct coff_filehdr *, struct coff_aouthdr *, int peoffs); static int exec_pecoff_coff_prep_nmagic(struct image_params *, struct coff_filehdr *, struct coff_aouthdr *, int peoffs); static int exec_pecoff_coff_prep_zmagic(struct image_params *, struct coff_filehdr *, struct coff_aouthdr *, int peoffs); static int exec_pecoff_coff_makecmds(struct image_params *, struct coff_filehdr *, int); static int pecoff_signature(struct thread *, struct vnode *, const struct pecoff_dos_filehdr *); static int pecoff_read_from(struct thread *, struct vnode *, int, caddr_t, int); static int pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot); static int pecoff_fixup(register_t ** stack_base, struct image_params * imgp) { int len = sizeof(struct pecoff_args); struct pecoff_imghdr *ap; register_t *pos; pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2); ap = (struct pecoff_imghdr *) imgp->auxargs; if (copyout(ap, pos, len)) { return 0; } free(ap, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; suword(*stack_base, (long) imgp->args->argc); return 0; } static int pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) { size_t map_len; vm_offset_t map_addr; int error, rv; size_t copy_len; size_t copy_map_len; size_t copy_start; vm_object_t object; vm_offset_t copy_map_offset; vm_offset_t file_addr; vm_offset_t data_buf = 0; object = vp->v_object; error = 0; map_addr = trunc_page((vm_offset_t) vmaddr); file_addr = trunc_page(offset); DPRINTF(("SECARG:%x %p %x %x\n", offset, vmaddr, memsz, filsz)); if (file_addr != offset) { /* * The section is not on page boundary. We can't use * vm_map_insert(). Use copyin instead. */ map_len = round_page(memsz); copy_len = filsz; copy_map_offset = file_addr; copy_map_len = round_page(offset + filsz) - file_addr; copy_start = offset - file_addr; DPRINTF(("offset=%x vmaddr=%lx filsz=%x memsz=%x\n", offset, (long)vmaddr, filsz, memsz)); DPRINTF(("map_len=%x copy_len=%x copy_map_offset=%x" " copy_map_len=%x copy_start=%x\n", map_len, copy_len, copy_map_offset, copy_map_len, copy_start)); } else { map_len = trunc_page(filsz); if (map_len != 0) { vm_object_reference(object); vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len, /* virtual end */ prot, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); vm_map_unlock(&vmspace->vm_map); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); return EINVAL; } /* we can stop now if we've covered it all */ if (memsz == filsz) return 0; } copy_map_offset = trunc_page(offset + filsz); copy_map_len = PAGE_SIZE; copy_start = 0; copy_len = (offset + filsz) - trunc_page(offset + filsz); map_addr = trunc_page((vm_offset_t) vmaddr + filsz); map_len = round_page((vm_offset_t) vmaddr + memsz) - map_addr; } if (map_len != 0) { vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(&vmspace->vm_map); DPRINTF(("EMP-rv:%d,%x %x\n", rv, map_addr, map_addr + map_len)); if (rv != KERN_SUCCESS) { return EINVAL; } } DPRINTF(("COPYARG %x %x\n", map_addr, copy_len)); if (copy_len != 0) { vm_object_reference(object); rv = vm_map_find(exec_map, object, copy_map_offset, &data_buf, copy_map_len, TRUE, VM_PROT_READ, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); return EINVAL; } /* send the page fragment to user space */ error = copyout((caddr_t) data_buf + copy_start, (caddr_t) map_addr, copy_len); vm_map_remove(exec_map, data_buf, data_buf + copy_map_len); DPRINTF(("%d\n", error)); if (error) return (error); } /* * set it to the specified protection */ vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, FALSE); return error; } static int pecoff_load_file(struct thread * td, const char *file, u_long * addr, u_long * entry, u_long * ldexport) { struct nameidata nd; struct pecoff_dos_filehdr dh; struct coff_filehdr *fp = 0; struct coff_aouthdr *ap; struct pecoff_opthdr *wp; struct coff_scnhdr *sh = 0; struct vmspace *vmspace = td->td_proc->p_vmspace; struct vattr attr; struct image_params image_params, *imgp; int peofs; int error, i, scnsiz; imgp = &image_params; /* * Initialize part of the common data */ imgp->proc = td->td_proc; imgp->execlabel = NULL; imgp->attr = &attr; imgp->firstpage = NULL; NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, td); if ((error = namei(&nd)) != 0) { nd.ni_vp = NULL; goto fail; } NDFREE(&nd, NDF_ONLY_PNBUF); imgp->vp = nd.ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); VOP_UNLOCK(nd.ni_vp, 0, td); if (error) goto fail; if ((error = pecoff_read_from(td, imgp->vp, 0, (caddr_t) & dh, sizeof(dh))) != 0) goto fail; if ((error = pecoff_signature(td, imgp->vp, &dh) != 0)) goto fail; fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK); peofs = dh.d_peofs + sizeof(signature) - 1; if ((error = pecoff_read_from(td, imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE) != 0)) goto fail; if (COFF_BADMAG(fp)) { error = ENOEXEC; goto fail; } ap = (void *) ((char *) fp + sizeof(struct coff_filehdr)); wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr)); /* read section header */ scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns; sh = malloc(scnsiz, M_TEMP, M_WAITOK); if ((error = pecoff_read_from(td, imgp->vp, peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz)) != 0) goto fail; /* * Read Section infomation and map sections. */ for (i = 0; i < fp->f_nscns; i++) { int prot = 0; if (sh[i].s_flags & COFF_STYP_DISCARD) continue; /* XXX ? */ if ((sh[i].s_flags & COFF_STYP_TEXT) && (sh[i].s_flags & COFF_STYP_EXEC) == 0) continue; if ((sh[i].s_flags & (COFF_STYP_TEXT | COFF_STYP_DATA | COFF_STYP_BSS)) == 0) continue; prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0; prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0; prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0; sh[i].s_vaddr += wp->w_base; /* RVA --> VA */ if ((error = pecoff_load_section(td, vmspace, imgp->vp, sh[i].s_scnptr ,(caddr_t) sh[i].s_vaddr, sh[i].s_paddr, sh[i].s_size ,prot)) != 0) goto fail; } *entry = wp->w_base + ap->a_entry; *addr = wp->w_base; *ldexport = wp->w_imghdr[0].i_vaddr + wp->w_base; fail: if (fp) free(fp, M_TEMP); if (sh) free(sh, M_TEMP); if (nd.ni_vp) vrele(nd.ni_vp); return error; } static int exec_pecoff_coff_prep_omagic(struct image_params * imgp, struct coff_filehdr * fp, struct coff_aouthdr * ap, int peofs) { return ENOEXEC; } static int exec_pecoff_coff_prep_nmagic(struct image_params * imgp, struct coff_filehdr * fp, struct coff_aouthdr * ap, int peofs) { return ENOEXEC; } static int exec_pecoff_coff_prep_zmagic(struct image_params * imgp, struct coff_filehdr * fp, struct coff_aouthdr * ap, int peofs) { int scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns; int error = ENOEXEC, i; int prot; u_long text_size = 0, data_size = 0, dsize; u_long text_addr = 0, data_addr = VM_MAXUSER_ADDRESS; u_long ldexport = 0, ldbase = 0; struct pecoff_opthdr *wp; struct coff_scnhdr *sh; struct vmspace *vmspace; struct pecoff_args *argp = NULL; sh = malloc(scnsiz, M_TEMP, M_WAITOK); wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr)); error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp, peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz); if (error) return (error); error = exec_new_vmspace(imgp, &pecoff_sysvec); if (error) return (error); vmspace = imgp->proc->p_vmspace; for (i = 0; i < fp->f_nscns; i++) { prot = VM_PROT_WRITE; /* XXX for relocation? */ prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0; prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0; prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0; sh[i].s_vaddr += wp->w_base; if (sh[i].s_flags & COFF_STYP_DISCARD) continue; if ((sh[i].s_flags & COFF_STYP_TEXT) != 0) { error = pecoff_load_section( FIRST_THREAD_IN_PROC(imgp->proc), vmspace, imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr, sh[i].s_paddr, sh[i].s_size ,prot); DPRINTF(("ERROR%d\n", error)); if (error) goto fail; text_addr = trunc_page(sh[i].s_vaddr); text_size = trunc_page(sh[i].s_size + sh[i].s_vaddr - text_addr); } if ((sh[i].s_flags & (COFF_STYP_DATA|COFF_STYP_BSS)) != 0) { if (pecoff_load_section( FIRST_THREAD_IN_PROC(imgp->proc), vmspace, imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr, sh[i].s_paddr, sh[i].s_size, prot) != 0) goto fail; data_addr = min(trunc_page(sh[i].s_vaddr), data_addr); dsize = round_page(sh[i].s_vaddr + sh[i].s_paddr) - data_addr; data_size = max(dsize, data_size); } } vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t) (uintptr_t) data_addr; argp = malloc(sizeof(struct pecoff_args), M_TEMP, M_WAITOK); if (argp == NULL) { error = ENOMEM; goto fail; } argp->a_base = wp->w_base; argp->a_entry = wp->w_base + ap->a_entry; argp->a_end = data_addr + data_size; argp->a_subsystem = wp->w_subvers; error = pecoff_load_file(FIRST_THREAD_IN_PROC(imgp->proc), "/usr/libexec/ld.so.dll", &ldbase, &imgp->entry_addr, &ldexport); if (error) goto fail; argp->a_ldbase = ldbase; argp->a_ldexport = ldexport; memcpy(argp->a_imghdr, wp->w_imghdr, sizeof(struct pecoff_imghdr) * 16); for (i = 0; i < 16; i++) { argp->a_imghdr[i].i_vaddr += wp->w_base; } imgp->proc->p_sysent = &pecoff_sysvec; imgp->auxargs = argp; imgp->auxarg_size = sizeof(struct pecoff_args); imgp->interpreted = 0; if (sh != NULL) free(sh, M_TEMP); return 0; fail: error = (error) ? error : ENOEXEC; if (sh != NULL) free(sh, M_TEMP); if (argp != NULL) free(argp, M_TEMP); return error; } int exec_pecoff_coff_makecmds(struct image_params * imgp, struct coff_filehdr * fp, int peofs) { struct coff_aouthdr *ap; int error; if (COFF_BADMAG(fp)) { return ENOEXEC; } ap = (void *) ((char *) fp + sizeof(struct coff_filehdr)); switch (ap->a_magic) { case COFF_OMAGIC: error = exec_pecoff_coff_prep_omagic(imgp, fp, ap, peofs); break; case COFF_NMAGIC: error = exec_pecoff_coff_prep_nmagic(imgp, fp, ap, peofs); break; case COFF_ZMAGIC: error = exec_pecoff_coff_prep_zmagic(imgp, fp, ap, peofs); break; default: return ENOEXEC; } return error; } static int pecoff_signature(td, vp, dp) struct thread *td; struct vnode *vp; const struct pecoff_dos_filehdr *dp; { int error; char buf[512]; char *pesig; if (DOS_BADMAG(dp)) { return ENOEXEC; } error = pecoff_read_from(td, vp, dp->d_peofs, buf, sizeof(buf)); if (error) { return error; } pesig = buf; if (memcmp(pesig, signature, sizeof(signature) - 1) == 0) { return 0; } return EFTYPE; } int pecoff_read_from(td, vp, pos, buf, siz) struct thread *td; struct vnode *vp; int pos; caddr_t buf; int siz; { int error; size_t resid; error = vn_rdwr(UIO_READ, vp, buf, siz, pos, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) return error; if (resid != 0) { return ENOEXEC; } return 0; } static int imgact_pecoff(struct image_params * imgp) { const struct pecoff_dos_filehdr *dp = (const struct pecoff_dos_filehdr *) imgp->image_header; struct coff_filehdr *fp; int error, peofs; struct thread *td = curthread; error = pecoff_signature(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp, dp); if (error) { return -1; } VOP_UNLOCK(imgp->vp, 0, td); peofs = dp->d_peofs + sizeof(signature) - 1; fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK); error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE); if (error) goto fail; error = exec_pecoff_coff_makecmds(imgp, fp, peofs); fail: free(fp, M_TEMP); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); return error; } static struct execsw pecoff_execsw = {imgact_pecoff, "FreeBSD PEcoff"}; EXEC_SET(pecoff, pecoff_execsw); Index: head/sys/compat/svr4/imgact_svr4.c =================================================================== --- head/sys/compat/svr4/imgact_svr4.c (revision 175201) +++ head/sys/compat/svr4/imgact_svr4.c (revision 175202) @@ -1,243 +1,243 @@ /*- * Copyright (c) 1998 Mark Newton * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Based heavily on /sys/kern/imgact_aout.c which is: * Copyright (c) 1993, David Greenman * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_svr4_imgact(struct image_params *iparams); static int exec_svr4_imgact(imgp) struct image_params *imgp; { const struct exec *a_out = (const struct exec *) imgp->image_header; struct vmspace *vmspace; vm_offset_t vmaddr; unsigned long virtual_offset, file_offset; vm_offset_t buffer; unsigned long bss_size; int error; struct thread *td = curthread; if (((a_out->a_magic >> 16) & 0xff) != 0x64) return -1; /* * Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: virtual_offset = 0; file_offset = 1024; break; case 0314: virtual_offset = 4096; file_offset = 0; break; default: return (-1); } bss_size = round_page(a_out->a_bss); #ifdef DEBUG printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", (u_long)a_out->a_text, (u_long)a_out->a_data, bss_size); #endif /* * Check various fields in header for validity/bounds. */ if (a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); VOP_UNLOCK(imgp->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ error = exec_new_vmspace(imgp, &svr4_sysvec); if (error) goto fail; vmspace = imgp->proc->p_vmspace; /* * Check if file_offset page aligned,. * Currently we cannot handle misalinged file offsets, * and so we read in the entire image (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("imgact: Non page aligned binary %lu\n", file_offset); #endif /* * Map text+data+bss read/write/execute */ vmaddr = virtual_offset; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data + bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; error = vm_mmap(kernel_map, &buffer, round_page(a_out->a_text + a_out->a_data + file_offset), VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, imgp->vp, trunc_page(file_offset)); if (error) goto fail; error = copyout((caddr_t)(buffer + file_offset), (caddr_t)vmaddr, a_out->a_text + a_out->a_data); vm_map_remove(kernel_map, buffer, buffer + round_page(a_out->a_text + a_out->a_data + file_offset)); if (error) goto fail; /* * remove write enable on the 'text' part */ error = vm_map_protect(&vmspace->vm_map, vmaddr, vmaddr + a_out->a_text, VM_PROT_EXECUTE|VM_PROT_READ, TRUE); if (error) goto fail; } else { #ifdef DEBUG printf("imgact: Page aligned binary %lu\n", file_offset); #endif /* * Map text+data read/execute */ vmaddr = virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, imgp->vp, file_offset); if (error) goto fail; #ifdef DEBUG printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr, (u_long)a_out->a_text + a_out->a_data); #endif /* * allow read/write of data */ error = vm_map_protect(&vmspace->vm_map, vmaddr + a_out->a_text, vmaddr + a_out->a_text + a_out->a_data, VM_PROT_ALL, FALSE); if (error) goto fail; /* * Allocate anon demand-zeroed area for uninitialized data */ if (bss_size != 0) { vmaddr = virtual_offset + a_out->a_text + a_out->a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; #ifdef DEBUG printf("imgact: bssaddr=%08lx, length=%08lx\n", (u_long)vmaddr, bss_size); #endif } } /* Fill in process VM information */ vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT; vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)virtual_offset; vmspace->vm_daddr = (caddr_t)virtual_offset + a_out->a_text; /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &svr4_sysvec; fail: - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ struct execsw svr4_execsw = { exec_svr4_imgact, "svr4 ELF" }; EXEC_SET(execsw_set, svr4_execsw); Index: head/sys/compat/svr4/svr4_fcntl.c =================================================================== --- head/sys/compat/svr4/svr4_fcntl.c (revision 175201) +++ head/sys/compat/svr4/svr4_fcntl.c (revision 175202) @@ -1,725 +1,725 @@ /*- * Copyright (c) 1998 Mark Newton * Copyright (c) 1994, 1997 Christos Zoulas. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include /*#include */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int svr4_to_bsd_flags(int); static u_long svr4_to_bsd_cmd(u_long); static int fd_revoke(struct thread *, int); static int fd_truncate(struct thread *, int, struct flock *); static int bsd_to_svr4_flags(int); static void bsd_to_svr4_flock(struct flock *, struct svr4_flock *); static void svr4_to_bsd_flock(struct svr4_flock *, struct flock *); static void bsd_to_svr4_flock64(struct flock *, struct svr4_flock64 *); static void svr4_to_bsd_flock64(struct svr4_flock64 *, struct flock *); static u_long svr4_to_bsd_cmd(cmd) u_long cmd; { switch (cmd) { case SVR4_F_DUPFD: return F_DUPFD; case SVR4_F_GETFD: return F_GETFD; case SVR4_F_SETFD: return F_SETFD; case SVR4_F_GETFL: return F_GETFL; case SVR4_F_SETFL: return F_SETFL; case SVR4_F_GETLK: return F_GETLK; case SVR4_F_SETLK: return F_SETLK; case SVR4_F_SETLKW: return F_SETLKW; default: return -1; } } static int svr4_to_bsd_flags(l) int l; { int r = 0; r |= (l & SVR4_O_RDONLY) ? O_RDONLY : 0; r |= (l & SVR4_O_WRONLY) ? O_WRONLY : 0; r |= (l & SVR4_O_RDWR) ? O_RDWR : 0; r |= (l & SVR4_O_NDELAY) ? O_NONBLOCK : 0; r |= (l & SVR4_O_APPEND) ? O_APPEND : 0; r |= (l & SVR4_O_SYNC) ? O_FSYNC : 0; r |= (l & SVR4_O_NONBLOCK) ? O_NONBLOCK : 0; r |= (l & SVR4_O_PRIV) ? O_EXLOCK : 0; r |= (l & SVR4_O_CREAT) ? O_CREAT : 0; r |= (l & SVR4_O_TRUNC) ? O_TRUNC : 0; r |= (l & SVR4_O_EXCL) ? O_EXCL : 0; r |= (l & SVR4_O_NOCTTY) ? O_NOCTTY : 0; return r; } static int bsd_to_svr4_flags(l) int l; { int r = 0; r |= (l & O_RDONLY) ? SVR4_O_RDONLY : 0; r |= (l & O_WRONLY) ? SVR4_O_WRONLY : 0; r |= (l & O_RDWR) ? SVR4_O_RDWR : 0; r |= (l & O_NDELAY) ? SVR4_O_NONBLOCK : 0; r |= (l & O_APPEND) ? SVR4_O_APPEND : 0; r |= (l & O_FSYNC) ? SVR4_O_SYNC : 0; r |= (l & O_NONBLOCK) ? SVR4_O_NONBLOCK : 0; r |= (l & O_EXLOCK) ? SVR4_O_PRIV : 0; r |= (l & O_CREAT) ? SVR4_O_CREAT : 0; r |= (l & O_TRUNC) ? SVR4_O_TRUNC : 0; r |= (l & O_EXCL) ? SVR4_O_EXCL : 0; r |= (l & O_NOCTTY) ? SVR4_O_NOCTTY : 0; return r; } static void bsd_to_svr4_flock(iflp, oflp) struct flock *iflp; struct svr4_flock *oflp; { switch (iflp->l_type) { case F_RDLCK: oflp->l_type = SVR4_F_RDLCK; break; case F_WRLCK: oflp->l_type = SVR4_F_WRLCK; break; case F_UNLCK: oflp->l_type = SVR4_F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off_t) iflp->l_start; oflp->l_len = (svr4_off_t) iflp->l_len; oflp->l_sysid = 0; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } static void svr4_to_bsd_flock(iflp, oflp) struct svr4_flock *iflp; struct flock *oflp; { switch (iflp->l_type) { case SVR4_F_RDLCK: oflp->l_type = F_RDLCK; break; case SVR4_F_WRLCK: oflp->l_type = F_WRLCK; break; case SVR4_F_UNLCK: oflp->l_type = F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = iflp->l_whence; oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; } static void bsd_to_svr4_flock64(iflp, oflp) struct flock *iflp; struct svr4_flock64 *oflp; { switch (iflp->l_type) { case F_RDLCK: oflp->l_type = SVR4_F_RDLCK; break; case F_WRLCK: oflp->l_type = SVR4_F_WRLCK; break; case F_UNLCK: oflp->l_type = SVR4_F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off64_t) iflp->l_start; oflp->l_len = (svr4_off64_t) iflp->l_len; oflp->l_sysid = 0; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } static void svr4_to_bsd_flock64(iflp, oflp) struct svr4_flock64 *iflp; struct flock *oflp; { switch (iflp->l_type) { case SVR4_F_RDLCK: oflp->l_type = F_RDLCK; break; case SVR4_F_WRLCK: oflp->l_type = F_WRLCK; break; case SVR4_F_UNLCK: oflp->l_type = F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = iflp->l_whence; oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; } static int fd_revoke(td, fd) struct thread *td; int fd; { struct vnode *vp; struct mount *mp; struct vattr vattr; int error, *retval; retval = td->td_retval; if ((error = fgetvp(td, fd, &vp)) != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VBLK) { error = EINVAL; goto out; } #ifdef MAC - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = mac_vnode_check_revoke(td->td_ucred, vp); VOP_UNLOCK(vp, 0, td); if (error) goto out; #endif if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred, td)) != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid && (error = priv_check(td, PRIV_VFS_ADMIN)) != 0) goto out; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return error; } static int fd_truncate(td, fd, flp) struct thread *td; int fd; struct flock *flp; { off_t start, length; struct file *fp; struct vnode *vp; struct vattr vattr; int error, *retval; struct ftruncate_args ft; retval = td->td_retval; /* * We only support truncating the file. */ if ((error = fget(td, fd, &fp)) != 0) return (error); vp = fp->f_vnode; if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { fdrop(fp, td); return ESPIPE; } if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred, td)) != 0) { fdrop(fp, td); return error; } length = vattr.va_size; switch (flp->l_whence) { case SEEK_CUR: start = fp->f_offset + flp->l_start; break; case SEEK_END: start = flp->l_start + length; break; case SEEK_SET: start = flp->l_start; break; default: fdrop(fp, td); return EINVAL; } if (start + flp->l_len < length) { /* We don't support free'ing in the middle of the file */ fdrop(fp, td); return EINVAL; } ft.fd = fd; ft.length = start; error = ftruncate(td, &ft); fdrop(fp, td); return (error); } int svr4_sys_open(td, uap) register struct thread *td; struct svr4_sys_open_args *uap; { struct proc *p = td->td_proc; char *newpath; int bsd_flags, error, retval; CHECKALTEXIST(td, uap->path, &newpath); bsd_flags = svr4_to_bsd_flags(uap->flags); error = kern_open(td, newpath, UIO_SYSSPACE, bsd_flags, uap->mode); free(newpath, M_TEMP); if (error) { /* uprintf("svr4_open(%s, 0x%0x, 0%o): %d\n", uap->path, uap->flags, uap->mode, error);*/ return error; } retval = td->td_retval[0]; PROC_LOCK(p); if (!(bsd_flags & O_NOCTTY) && SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { #if defined(NOTYET) struct file *fp; error = fget(td, retval, &fp); PROC_UNLOCK(p); /* * we may have lost a race the above open() and * another thread issuing a close() */ if (error) return (EBADF); /* XXX: correct errno? */ /* ignore any error, just give it a try */ if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); fdrop(fp, td); } else { PROC_UNLOCK(p); } #else } PROC_UNLOCK(p); #endif return error; } int svr4_sys_open64(td, uap) register struct thread *td; struct svr4_sys_open64_args *uap; { return svr4_sys_open(td, (struct svr4_sys_open_args *)uap); } int svr4_sys_creat(td, uap) register struct thread *td; struct svr4_sys_creat_args *uap; { char *newpath; int error; CHECKALTEXIST(td, uap->path, &newpath); error = kern_open(td, newpath, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode); free(newpath, M_TEMP); return (error); } int svr4_sys_creat64(td, uap) register struct thread *td; struct svr4_sys_creat64_args *uap; { return svr4_sys_creat(td, (struct svr4_sys_creat_args *)uap); } int svr4_sys_llseek(td, uap) register struct thread *td; struct svr4_sys_llseek_args *uap; { struct lseek_args ap; ap.fd = uap->fd; #if BYTE_ORDER == BIG_ENDIAN ap.offset = (((u_int64_t) uap->offset1) << 32) | uap->offset2; #else ap.offset = (((u_int64_t) uap->offset2) << 32) | uap->offset1; #endif ap.whence = uap->whence; return lseek(td, &ap); } int svr4_sys_access(td, uap) register struct thread *td; struct svr4_sys_access_args *uap; { char *newpath; int error; CHECKALTEXIST(td, uap->path, &newpath); error = kern_access(td, newpath, UIO_SYSSPACE, uap->flags); free(newpath, M_TEMP); return (error); } #if defined(NOTYET) int svr4_sys_pread(td, uap) register struct thread *td; struct svr4_sys_pread_args *uap; { struct pread_args pra; /* * Just translate the args structure and call the NetBSD * pread(2) system call (offset type is 64-bit in NetBSD). */ pra.fd = uap->fd; pra.buf = uap->buf; pra.nbyte = uap->nbyte; pra.offset = uap->off; return pread(td, &pra); } #endif #if defined(NOTYET) int svr4_sys_pread64(td, v, retval) register struct thread *td; void *v; register_t *retval; { struct svr4_sys_pread64_args *uap = v; struct sys_pread_args pra; /* * Just translate the args structure and call the NetBSD * pread(2) system call (offset type is 64-bit in NetBSD). */ pra.fd = uap->fd; pra.buf = uap->buf; pra.nbyte = uap->nbyte; pra.offset = uap->off; return (sys_pread(td, &pra, retval)); } #endif /* NOTYET */ #if defined(NOTYET) int svr4_sys_pwrite(td, uap) register struct thread *td; struct svr4_sys_pwrite_args *uap; { struct pwrite_args pwa; /* * Just translate the args structure and call the NetBSD * pwrite(2) system call (offset type is 64-bit in NetBSD). */ pwa.fd = uap->fd; pwa.buf = uap->buf; pwa.nbyte = uap->nbyte; pwa.offset = uap->off; return pwrite(td, &pwa); } #endif #if defined(NOTYET) int svr4_sys_pwrite64(td, v, retval) register struct thread *td; void *v; register_t *retval; { struct svr4_sys_pwrite64_args *uap = v; struct sys_pwrite_args pwa; /* * Just translate the args structure and call the NetBSD * pwrite(2) system call (offset type is 64-bit in NetBSD). */ pwa.fd = uap->fd; pwa.buf = uap->buf; pwa.nbyte = uap->nbyte; pwa.offset = uap->off; return (sys_pwrite(td, &pwa, retval)); } #endif /* NOTYET */ int svr4_sys_fcntl(td, uap) register struct thread *td; struct svr4_sys_fcntl_args *uap; { int cmd, error, *retval; retval = td->td_retval; cmd = svr4_to_bsd_cmd(uap->cmd); switch (cmd) { case F_DUPFD: case F_GETFD: case F_SETFD: return (kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg)); case F_GETFL: error = kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg); if (error) return (error); *retval = bsd_to_svr4_flags(*retval); return (error); case F_SETFL: { /* * we must save the O_ASYNC flag, as that is * handled by ioctl(_, I_SETSIG, _) emulation. */ int flags; DPRINTF(("Setting flags %p\n", uap->arg)); error = kern_fcntl(td, uap->fd, F_GETFL, 0); if (error) return (error); flags = *retval; flags &= O_ASYNC; flags |= svr4_to_bsd_flags((u_long) uap->arg); return (kern_fcntl(td, uap->fd, F_SETFL, flags)); } case F_GETLK: case F_SETLK: case F_SETLKW: { struct svr4_flock ifl; struct flock fl; error = copyin(uap->arg, &ifl, sizeof (ifl)); if (error) return (error); svr4_to_bsd_flock(&ifl, &fl); error = kern_fcntl(td, uap->fd, cmd, (intptr_t)&fl); if (error || cmd != F_GETLK) return (error); bsd_to_svr4_flock(&fl, &ifl); return (copyout(&ifl, uap->arg, sizeof (ifl))); } case -1: switch (uap->cmd) { case SVR4_F_DUP2FD: { struct dup2_args du; du.from = uap->fd; du.to = (int)uap->arg; error = dup2(td, &du); if (error) return error; *retval = du.to; return 0; } case SVR4_F_FREESP: { struct svr4_flock ifl; struct flock fl; error = copyin(uap->arg, &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock(&ifl, &fl); return fd_truncate(td, uap->fd, &fl); } case SVR4_F_GETLK64: case SVR4_F_SETLK64: case SVR4_F_SETLKW64: { struct svr4_flock64 ifl; struct flock fl; switch (uap->cmd) { case SVR4_F_GETLK64: cmd = F_GETLK; break; case SVR4_F_SETLK64: cmd = F_SETLK; break; case SVR4_F_SETLKW64: cmd = F_SETLKW; break; } error = copyin(uap->arg, &ifl, sizeof (ifl)); if (error) return (error); svr4_to_bsd_flock64(&ifl, &fl); error = kern_fcntl(td, uap->fd, cmd, (intptr_t)&fl); if (error || cmd != F_GETLK) return (error); bsd_to_svr4_flock64(&fl, &ifl); return (copyout(&ifl, uap->arg, sizeof (ifl))); } case SVR4_F_FREESP64: { struct svr4_flock64 ifl; struct flock fl; error = copyin(uap->arg, &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock64(&ifl, &fl); return fd_truncate(td, uap->fd, &fl); } case SVR4_F_REVOKE: return fd_revoke(td, uap->fd); default: return ENOSYS; } default: return ENOSYS; } } Index: head/sys/compat/svr4/svr4_misc.c =================================================================== --- head/sys/compat/svr4/svr4_misc.c (revision 175201) +++ head/sys/compat/svr4/svr4_misc.c (revision 175202) @@ -1,1635 +1,1635 @@ /*- * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * SVR4 compatibility module. * * SVR4 system calls that are implemented differently in BSD are * handled here. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #include #endif #if defined(NetBSD) # if defined(UVM) # include # endif #endif #define BSD_DIRENT(cp) ((struct dirent *)(cp)) static int svr4_mknod(struct thread *, register_t *, char *, svr4_mode_t, svr4_dev_t); static __inline clock_t timeval_to_clock_t(struct timeval *); static int svr4_setinfo (pid_t , struct rusage *, int, svr4_siginfo_t *); struct svr4_hrtcntl_args; static int svr4_hrtcntl (struct thread *, struct svr4_hrtcntl_args *, register_t *); static void bsd_statfs_to_svr4_statvfs(const struct statfs *, struct svr4_statvfs *); static void bsd_statfs_to_svr4_statvfs64(const struct statfs *, struct svr4_statvfs64 *); static struct proc *svr4_pfind(pid_t pid); /* BOGUS noop */ #if defined(BOGUS) int svr4_sys_setitimer(td, uap) register struct thread *td; struct svr4_sys_setitimer_args *uap; { td->td_retval[0] = 0; return 0; } #endif int svr4_sys_wait(td, uap) struct thread *td; struct svr4_sys_wait_args *uap; { int error, st, sig; error = kern_wait(td, WAIT_ANY, &st, 0, NULL); if (error) return (error); if (WIFSIGNALED(st)) { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig); } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8); } /* * It looks like wait(2) on svr4/solaris/2.4 returns * the status in retval[1], and the pid on retval[0]. */ td->td_retval[1] = st; if (uap->status) error = copyout(&st, uap->status, sizeof(st)); return (error); } int svr4_sys_execv(td, uap) struct thread *td; struct svr4_sys_execv_args *uap; { struct image_args eargs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); return (error); } int svr4_sys_execve(td, uap) struct thread *td; struct svr4_sys_execve_args *uap; { struct image_args eargs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, uap->envp); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); return (error); } int svr4_sys_time(td, v) struct thread *td; struct svr4_sys_time_args *v; { struct svr4_sys_time_args *uap = v; int error = 0; struct timeval tv; microtime(&tv); if (uap->t) error = copyout(&tv.tv_sec, uap->t, sizeof(*(uap->t))); td->td_retval[0] = (int) tv.tv_sec; return error; } /* * Read SVR4-style directory entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. * * This code is ported from the Linux emulator: Changes to the VFS interface * between FreeBSD and NetBSD have made it simpler to port it from there than * to adapt the NetBSD version. */ int svr4_sys_getdents64(td, uap) struct thread *td; struct svr4_sys_getdents64_args *uap; { register struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4reclen=0; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; off_t off; struct svr4_dirent64 svr4_dirent; int buflen, error, eofflag, nbytes, justone, vfslocked; u_long *cookies = NULL, *cookiep; int ncookies; DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n", uap->fd, uap->nbytes)); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) { return (error); } if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } nbytes = uap->nbytes; if (nbytes == 1) { nbytes = sizeof (struct svr4_dirent64); justone = 1; } else justone = 0; off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) goto out; #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies); if (error) { goto out; } inp = buf; outp = (caddr_t) uap->dp; resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) { goto eof; } cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { DPRINTF(("svr4_readdir: reclen=%d\n", reclen)); error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen); if (reclen > len || resid < svr4reclen) { outp++; break; } svr4_dirent.d_ino = (long) bdp->d_fileno; if (justone) { /* * old svr4-style readdir usage. */ svr4_dirent.d_off = (svr4_off_t) svr4reclen; svr4_dirent.d_reclen = (u_short) bdp->d_namlen; } else { svr4_dirent.d_off = (svr4_off_t)(off + reclen); svr4_dirent.d_reclen = (u_short) svr4reclen; } strcpy(svr4_dirent.d_name, bdp->d_name); if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen))) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += svr4reclen; resid -= svr4reclen; len -= reclen; if (justone) break; } if (outp == (caddr_t) uap->dp) goto again; fp->f_offset = off; if (justone) nbytes = resid + svr4reclen; eof: td->td_retval[0] = nbytes - resid; out: VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); if (cookies) free(cookies, M_TEMP); free(buf, M_TEMP); return error; } int svr4_sys_getdents(td, uap) struct thread *td; struct svr4_sys_getdents_args *uap; { struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4_reclen; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; struct svr4_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag, vfslocked; u_long *cookiebuf = NULL, *cookie; int ncookies = 0, *retval = td->td_retval; if (uap->nbytes < 0) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } buflen = min(MAXBSIZE, uap->nbytes); buf = malloc(buflen, M_TEMP, M_WAITOK); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); off = fp->f_offset; again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) goto out; #endif /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookiebuf); if (error) { goto out; } inp = buf; outp = uap->buf; resid = uap->nbytes; if ((len = buflen - auio.uio_resid) == 0) goto eof; for (cookie = cookiebuf; len > 0; len -= reclen) { bdp = (struct dirent *)inp; reclen = bdp->d_reclen; if (reclen & 3) panic("svr4_sys_getdents64: bad reclen"); off = *cookie++; /* each entry points to the next */ if ((off >> 32) != 0) { uprintf("svr4_sys_getdents64: dir offset too large for emulated program"); error = EINVAL; goto out; } if (bdp->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ continue; } svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen); if (reclen > len || resid < svr4_reclen) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a SVR4-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). */ idb.d_ino = (svr4_ino_t)bdp->d_fileno; idb.d_off = (svr4_off_t)off; idb.d_reclen = (u_short)svr4_reclen; strcpy(idb.d_name, bdp->d_name); if ((error = copyout((caddr_t)&idb, outp, svr4_reclen))) goto out; /* advance past this real entry */ inp += reclen; /* advance output past SVR4-shaped entry */ outp += svr4_reclen; resid -= svr4_reclen; } /* if we squished out the whole block, try again */ if (outp == uap->buf) goto again; fp->f_offset = off; /* update the vnode offset */ eof: *retval = uap->nbytes - resid; out: VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); if (cookiebuf) free(cookiebuf, M_TEMP); free(buf, M_TEMP); return error; } int svr4_sys_mmap(td, uap) struct thread *td; struct svr4_sys_mmap_args *uap; { struct mmap_args mm; int *retval; retval = td->td_retval; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? */ if (uap->len == 0) return EINVAL; mm.prot = uap->prot; mm.len = uap->len; mm.flags = uap->flags & ~_MAP_NEW; mm.fd = uap->fd; mm.addr = uap->addr; mm.pos = uap->pos; return mmap(td, &mm); } int svr4_sys_mmap64(td, uap) struct thread *td; struct svr4_sys_mmap64_args *uap; { struct mmap_args mm; void *rp; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? */ if (uap->len == 0) return EINVAL; mm.prot = uap->prot; mm.len = uap->len; mm.flags = uap->flags & ~_MAP_NEW; mm.fd = uap->fd; mm.addr = uap->addr; mm.pos = uap->pos; rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz)); if ((mm.flags & MAP_FIXED) == 0 && mm.addr != 0 && (void *)mm.addr < rp) mm.addr = rp; return mmap(td, &mm); } int svr4_sys_fchroot(td, uap) struct thread *td; struct svr4_sys_fchroot_args *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp; struct file *fp; int error, vfslocked; if ((error = priv_check(td, PRIV_VFS_FCHROOT)) != 0) return error; if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return error; vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = change_dir(vp, td); if (error) goto fail; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, vp); if (error) goto fail; #endif VOP_UNLOCK(vp, 0, td); error = change_root(vp, td); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); fail: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } static int svr4_mknod(td, retval, path, mode, dev) struct thread *td; register_t *retval; char *path; svr4_mode_t mode; svr4_dev_t dev; { char *newpath; int error; CHECKALTEXIST(td, path, &newpath); if (S_ISFIFO(mode)) error = kern_mkfifo(td, newpath, UIO_SYSSPACE, mode); else error = kern_mknod(td, newpath, UIO_SYSSPACE, mode, dev); free(newpath, M_TEMP); return (error); } int svr4_sys_mknod(td, uap) register struct thread *td; struct svr4_sys_mknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, uap->path, uap->mode, (svr4_dev_t)svr4_to_bsd_odev_t(uap->dev)); } int svr4_sys_xmknod(td, uap) struct thread *td; struct svr4_sys_xmknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, uap->path, uap->mode, (svr4_dev_t)svr4_to_bsd_dev_t(uap->dev)); } int svr4_sys_vhangup(td, uap) struct thread *td; struct svr4_sys_vhangup_args *uap; { return 0; } int svr4_sys_sysconfig(td, uap) struct thread *td; struct svr4_sys_sysconfig_args *uap; { int *retval; retval = &(td->td_retval[0]); switch (uap->name) { case SVR4_CONFIG_UNUSED: *retval = 0; break; case SVR4_CONFIG_NGROUPS: *retval = NGROUPS_MAX; break; case SVR4_CONFIG_CHILD_MAX: *retval = maxproc; break; case SVR4_CONFIG_OPEN_FILES: *retval = maxfiles; break; case SVR4_CONFIG_POSIX_VER: *retval = 198808; break; case SVR4_CONFIG_PAGESIZE: *retval = PAGE_SIZE; break; case SVR4_CONFIG_CLK_TCK: *retval = 60; /* should this be `hz', ie. 100? */ break; case SVR4_CONFIG_XOPEN_VER: *retval = 2; /* XXX: What should that be? */ break; case SVR4_CONFIG_PROF_TCK: *retval = 60; /* XXX: What should that be? */ break; case SVR4_CONFIG_NPROC_CONF: *retval = 1; /* Only one processor for now */ break; case SVR4_CONFIG_NPROC_ONLN: *retval = 1; /* And it better be online */ break; case SVR4_CONFIG_AIO_LISTIO_MAX: case SVR4_CONFIG_AIO_MAX: case SVR4_CONFIG_AIO_PRIO_DELTA_MAX: *retval = 0; /* No aio support */ break; case SVR4_CONFIG_DELAYTIMER_MAX: *retval = 0; /* No delaytimer support */ break; case SVR4_CONFIG_MQ_OPEN_MAX: *retval = msginfo.msgmni; break; case SVR4_CONFIG_MQ_PRIO_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_RTSIG_MAX: *retval = 0; break; case SVR4_CONFIG_SEM_NSEMS_MAX: *retval = seminfo.semmni; break; case SVR4_CONFIG_SEM_VALUE_MAX: *retval = seminfo.semvmx; break; case SVR4_CONFIG_SIGQUEUE_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_SIGRT_MIN: case SVR4_CONFIG_SIGRT_MAX: *retval = 0; /* No real time signals */ break; case SVR4_CONFIG_TIMER_MAX: *retval = 3; /* XXX: real, virtual, profiling */ break; #if defined(NOTYET) case SVR4_CONFIG_PHYS_PAGES: #if defined(UVM) *retval = uvmexp.free; /* XXX: free instead of total */ #else *retval = cnt.v_free_count; /* XXX: free instead of total */ #endif break; case SVR4_CONFIG_AVPHYS_PAGES: #if defined(UVM) *retval = uvmexp.active; /* XXX: active instead of avg */ #else *retval = cnt.v_active_count; /* XXX: active instead of avg */ #endif break; #endif /* NOTYET */ default: return EINVAL; } return 0; } /* ARGSUSED */ int svr4_sys_break(td, uap) struct thread *td; struct svr4_sys_break_args *uap; { struct obreak_args ap; ap.nsize = uap->nsize; return (obreak(td, &ap)); } static __inline clock_t timeval_to_clock_t(tv) struct timeval *tv; { return tv->tv_sec * hz + tv->tv_usec / (1000000 / hz); } int svr4_sys_times(td, uap) struct thread *td; struct svr4_sys_times_args *uap; { struct timeval tv, utime, stime, cutime, cstime; struct tms tms; struct proc *p; int error; p = td->td_proc; PROC_LOCK(p); PROC_SLOCK(p); calcru(p, &utime, &stime); PROC_SUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = timeval_to_clock_t(&utime); tms.tms_stime = timeval_to_clock_t(&stime); tms.tms_cutime = timeval_to_clock_t(&cutime); tms.tms_cstime = timeval_to_clock_t(&cstime); error = copyout(&tms, uap->tp, sizeof(tms)); if (error) return (error); microtime(&tv); td->td_retval[0] = (int)timeval_to_clock_t(&tv); return (0); } int svr4_sys_ulimit(td, uap) struct thread *td; struct svr4_sys_ulimit_args *uap; { int *retval = td->td_retval; int error; switch (uap->cmd) { case SVR4_GFILLIM: PROC_LOCK(td->td_proc); *retval = lim_cur(td->td_proc, RLIMIT_FSIZE) / 512; PROC_UNLOCK(td->td_proc); if (*retval == -1) *retval = 0x7fffffff; return 0; case SVR4_SFILLIM: { struct rlimit krl; krl.rlim_cur = uap->newlimit * 512; PROC_LOCK(td->td_proc); krl.rlim_max = lim_max(td->td_proc, RLIMIT_FSIZE); PROC_UNLOCK(td->td_proc); error = kern_setrlimit(td, RLIMIT_FSIZE, &krl); if (error) return error; PROC_LOCK(td->td_proc); *retval = lim_cur(td->td_proc, RLIMIT_FSIZE); PROC_UNLOCK(td->td_proc); if (*retval == -1) *retval = 0x7fffffff; return 0; } case SVR4_GMEMLIM: { struct vmspace *vm = td->td_proc->p_vmspace; register_t r; PROC_LOCK(td->td_proc); r = lim_cur(td->td_proc, RLIMIT_DATA); PROC_UNLOCK(td->td_proc); if (r == -1) r = 0x7fffffff; mtx_lock(&Giant); /* XXX */ r += (long) vm->vm_daddr; mtx_unlock(&Giant); if (r < 0) r = 0x7fffffff; *retval = r; return 0; } case SVR4_GDESLIM: PROC_LOCK(td->td_proc); *retval = lim_cur(td->td_proc, RLIMIT_NOFILE); PROC_UNLOCK(td->td_proc); if (*retval == -1) *retval = 0x7fffffff; return 0; default: return EINVAL; } } static struct proc * svr4_pfind(pid) pid_t pid; { struct proc *p; /* look in the live processes */ if ((p = pfind(pid)) == NULL) /* look in the zombies */ p = zpfind(pid); return p; } int svr4_sys_pgrpsys(td, uap) struct thread *td; struct svr4_sys_pgrpsys_args *uap; { int *retval = td->td_retval; struct proc *p = td->td_proc; switch (uap->cmd) { case 1: /* setpgrp() */ /* * SVR4 setpgrp() (which takes no arguments) has the * semantics that the session ID is also created anew, so * in almost every sense, setpgrp() is identical to * setsid() for SVR4. (Under BSD, the difference is that * a setpgid(0,0) will not create a new session.) */ setsid(td, NULL); /*FALLTHROUGH*/ case 0: /* getpgrp() */ PROC_LOCK(p); *retval = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 2: /* getsid(pid) */ if (uap->pid == 0) PROC_LOCK(p); else if ((p = svr4_pfind(uap->pid)) == NULL) return ESRCH; /* * This has already been initialized to the pid of * the session leader. */ *retval = (register_t) p->p_session->s_sid; PROC_UNLOCK(p); return 0; case 3: /* setsid() */ return setsid(td, NULL); case 4: /* getpgid(pid) */ if (uap->pid == 0) PROC_LOCK(p); else if ((p = svr4_pfind(uap->pid)) == NULL) return ESRCH; *retval = (int) p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 5: /* setpgid(pid, pgid); */ { struct setpgid_args sa; sa.pid = uap->pid; sa.pgid = uap->pgid; return setpgid(td, &sa); } default: return EINVAL; } } struct svr4_hrtcntl_args { int cmd; int fun; int clk; svr4_hrt_interval_t * iv; svr4_hrt_time_t * ti; }; static int svr4_hrtcntl(td, uap, retval) struct thread *td; struct svr4_hrtcntl_args *uap; register_t *retval; { switch (uap->fun) { case SVR4_HRT_CNTL_RES: DPRINTF(("htrcntl(RES)\n")); *retval = SVR4_HRT_USEC; return 0; case SVR4_HRT_CNTL_TOFD: DPRINTF(("htrcntl(TOFD)\n")); { struct timeval tv; svr4_hrt_time_t t; if (uap->clk != SVR4_HRT_CLK_STD) { DPRINTF(("clk == %d\n", uap->clk)); return EINVAL; } if (uap->ti == NULL) { DPRINTF(("ti NULL\n")); return EINVAL; } microtime(&tv); t.h_sec = tv.tv_sec; t.h_rem = tv.tv_usec; t.h_res = SVR4_HRT_USEC; return copyout(&t, uap->ti, sizeof(t)); } case SVR4_HRT_CNTL_START: DPRINTF(("htrcntl(START)\n")); return ENOSYS; case SVR4_HRT_CNTL_GET: DPRINTF(("htrcntl(GET)\n")); return ENOSYS; default: DPRINTF(("Bad htrcntl command %d\n", uap->fun)); return ENOSYS; } } int svr4_sys_hrtsys(td, uap) struct thread *td; struct svr4_sys_hrtsys_args *uap; { int *retval = td->td_retval; switch (uap->cmd) { case SVR4_HRT_CNTL: return svr4_hrtcntl(td, (struct svr4_hrtcntl_args *) uap, retval); case SVR4_HRT_ALRM: DPRINTF(("hrtalarm\n")); return ENOSYS; case SVR4_HRT_SLP: DPRINTF(("hrtsleep\n")); return ENOSYS; case SVR4_HRT_CAN: DPRINTF(("hrtcancel\n")); return ENOSYS; default: DPRINTF(("Bad hrtsys command %d\n", uap->cmd)); return EINVAL; } } static int svr4_setinfo(pid, ru, st, s) pid_t pid; struct rusage *ru; int st; svr4_siginfo_t *s; { svr4_siginfo_t i; int sig; memset(&i, 0, sizeof(i)); i.svr4_si_signo = SVR4_SIGCHLD; i.svr4_si_errno = 0; /* XXX? */ i.svr4_si_pid = pid; if (ru) { i.svr4_si_stime = ru->ru_stime.tv_sec; i.svr4_si_utime = ru->ru_utime.tv_sec; } if (WIFEXITED(st)) { i.svr4_si_status = WEXITSTATUS(st); i.svr4_si_code = SVR4_CLD_EXITED; } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig); if (i.svr4_si_status == SVR4_SIGCONT) i.svr4_si_code = SVR4_CLD_CONTINUED; else i.svr4_si_code = SVR4_CLD_STOPPED; } else { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig); if (WCOREDUMP(st)) i.svr4_si_code = SVR4_CLD_DUMPED; else i.svr4_si_code = SVR4_CLD_KILLED; } DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n", i.svr4_si_pid, i.svr4_si_signo, i.svr4_si_code, i.svr4_si_errno, i.svr4_si_status)); return copyout(&i, s, sizeof(i)); } int svr4_sys_waitsys(td, uap) struct thread *td; struct svr4_sys_waitsys_args *uap; { struct rusage ru; pid_t pid; int nfound, status; int error, *retval = td->td_retval; struct proc *p, *q; DPRINTF(("waitsys(%d, %d, %p, %x)\n", uap->grp, uap->id, uap->info, uap->options)); q = td->td_proc; switch (uap->grp) { case SVR4_P_PID: pid = uap->id; break; case SVR4_P_PGID: PROC_LOCK(q); pid = -q->p_pgid; PROC_UNLOCK(q); break; case SVR4_P_ALL: pid = WAIT_ANY; break; default: return EINVAL; } /* Hand off the easy cases to kern_wait(). */ if (!(uap->options & (SVR4_WNOWAIT)) && (uap->options & (SVR4_WEXITED | SVR4_WTRAPPED))) { int options; options = 0; if (uap->options & SVR4_WSTOPPED) options |= WUNTRACED; if (uap->options & SVR4_WCONTINUED) options |= WCONTINUED; if (uap->options & SVR4_WNOHANG) options |= WNOHANG; error = kern_wait(td, pid, &status, options, &ru); if (error) return (error); if (uap->options & SVR4_WNOHANG && *retval == 0) error = svr4_setinfo(*retval, NULL, 0, uap->info); else error = svr4_setinfo(*retval, &ru, status, uap->info); *retval = 0; return (error); } /* * Ok, handle the weird cases. Either WNOWAIT is set (meaning we * just want to see if there is a process to harvest, we dont' * want to actually harvest it), or WEXIT and WTRAPPED are clear * meaning we want to ignore zombies. Either way, we don't have * to handle harvesting zombies here. We do have to duplicate the * other portions of kern_wait() though, especially for the * WCONTINUED and WSTOPPED. */ loop: nfound = 0; sx_slock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (pid != WAIT_ANY && p->p_pid != pid && p->p_pgid != -pid) { PROC_UNLOCK(p); DPRINTF(("pid %d pgid %d != %d\n", p->p_pid, p->p_pgid, pid)); continue; } if (p_canwait(td, p)) { PROC_UNLOCK(p); continue; } nfound++; PROC_SLOCK(p); /* * See if we have a zombie. If so, WNOWAIT should be set, * as otherwise we should have called kern_wait() up above. */ if ((p->p_state == PRS_ZOMBIE) && ((uap->options & (SVR4_WEXITED|SVR4_WTRAPPED)))) { PROC_SUNLOCK(p); KASSERT(uap->options & SVR4_WNOWAIT, ("WNOWAIT is clear")); /* Found a zombie, so cache info in local variables. */ pid = p->p_pid; status = p->p_xstat; ru = p->p_ru; PROC_SLOCK(p); calcru(p, &ru.ru_utime, &ru.ru_stime); PROC_SUNLOCK(p); PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* Copy the info out to userland. */ *retval = 0; DPRINTF(("found %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } /* * See if we have a stopped or continued process. * XXX: This duplicates the same code in kern_wait(). */ if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) { PROC_SUNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag |= P_WAITED; sx_sunlock(&proctree_lock); pid = p->p_pid; status = W_STOPCODE(p->p_xstat); ru = p->p_ru; PROC_SLOCK(p); calcru(p, &ru.ru_utime, &ru.ru_stime); PROC_SUNLOCK(p); PROC_UNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } *retval = 0; DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } PROC_SUNLOCK(p); if (uap->options & SVR4_WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_sunlock(&proctree_lock); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag &= ~P_CONTINUED; pid = p->p_pid; ru = p->p_ru; status = SIGCONT; PROC_SLOCK(p); calcru(p, &ru.ru_utime, &ru.ru_stime); PROC_SUNLOCK(p); PROC_UNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } *retval = 0; DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } PROC_UNLOCK(p); } if (nfound == 0) { sx_sunlock(&proctree_lock); return (ECHILD); } if (uap->options & SVR4_WNOHANG) { sx_sunlock(&proctree_lock); *retval = 0; return (svr4_setinfo(0, NULL, 0, uap->info)); } PROC_LOCK(q); sx_sunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "svr4_wait", 0); PROC_UNLOCK(q); if (error) return error; goto loop; } static void bsd_statfs_to_svr4_statvfs(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } static void bsd_statfs_to_svr4_statvfs64(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs64 *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } int svr4_sys_statvfs(td, uap) struct thread *td; struct svr4_sys_statvfs_args *uap; { struct svr4_statvfs sfs; struct statfs bfs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_statfs(td, path, UIO_SYSSPACE, &bfs); free(path, M_TEMP); if (error) return (error); bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, uap->fs, sizeof(sfs)); } int svr4_sys_fstatvfs(td, uap) struct thread *td; struct svr4_sys_fstatvfs_args *uap; { struct svr4_statvfs sfs; struct statfs bfs; int error; error = kern_fstatfs(td, uap->fd, &bfs); if (error) return (error); bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, uap->fs, sizeof(sfs)); } int svr4_sys_statvfs64(td, uap) struct thread *td; struct svr4_sys_statvfs64_args *uap; { struct svr4_statvfs64 sfs; struct statfs bfs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_statfs(td, path, UIO_SYSSPACE, &bfs); free(path, M_TEMP); if (error) return (error); bsd_statfs_to_svr4_statvfs64(&bfs, &sfs); return copyout(&sfs, uap->fs, sizeof(sfs)); } int svr4_sys_fstatvfs64(td, uap) struct thread *td; struct svr4_sys_fstatvfs64_args *uap; { struct svr4_statvfs64 sfs; struct statfs bfs; int error; error = kern_fstatfs(td, uap->fd, &bfs); if (error) return (error); bsd_statfs_to_svr4_statvfs64(&bfs, &sfs); return copyout(&sfs, uap->fs, sizeof(sfs)); } int svr4_sys_alarm(td, uap) struct thread *td; struct svr4_sys_alarm_args *uap; { struct itimerval itv, oitv; int error; timevalclear(&itv.it_interval); itv.it_value.tv_sec = uap->sec; itv.it_value.tv_usec = 0; error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv); if (error) return (error); if (oitv.it_value.tv_usec != 0) oitv.it_value.tv_sec++; td->td_retval[0] = oitv.it_value.tv_sec; return (0); } int svr4_sys_gettimeofday(td, uap) struct thread *td; struct svr4_sys_gettimeofday_args *uap; { if (uap->tp) { struct timeval atv; microtime(&atv); return copyout(&atv, uap->tp, sizeof (atv)); } return 0; } int svr4_sys_facl(td, uap) struct thread *td; struct svr4_sys_facl_args *uap; { int *retval; retval = td->td_retval; *retval = 0; switch (uap->cmd) { case SVR4_SYS_SETACL: /* We don't support acls on any filesystem */ return ENOSYS; case SVR4_SYS_GETACL: return copyout(retval, &uap->num, sizeof(uap->num)); case SVR4_SYS_GETACLCNT: return 0; default: return EINVAL; } } int svr4_sys_acl(td, uap) struct thread *td; struct svr4_sys_acl_args *uap; { /* XXX: for now the same */ return svr4_sys_facl(td, (struct svr4_sys_facl_args *)uap); } int svr4_sys_auditsys(td, uap) struct thread *td; struct svr4_sys_auditsys_args *uap; { /* * XXX: Big brother is *not* watching. */ return 0; } int svr4_sys_memcntl(td, uap) struct thread *td; struct svr4_sys_memcntl_args *uap; { switch (uap->cmd) { case SVR4_MC_SYNC: { struct msync_args msa; msa.addr = uap->addr; msa.len = uap->len; msa.flags = (int)uap->arg; return msync(td, &msa); } case SVR4_MC_ADVISE: { struct madvise_args maa; maa.addr = uap->addr; maa.len = uap->len; maa.behav = (int)uap->arg; return madvise(td, &maa); } case SVR4_MC_LOCK: case SVR4_MC_UNLOCK: case SVR4_MC_LOCKAS: case SVR4_MC_UNLOCKAS: return EOPNOTSUPP; default: return ENOSYS; } } int svr4_sys_nice(td, uap) struct thread *td; struct svr4_sys_nice_args *uap; { struct setpriority_args ap; int error; ap.which = PRIO_PROCESS; ap.who = 0; ap.prio = uap->prio; if ((error = setpriority(td, &ap)) != 0) return error; /* the cast is stupid, but the structures are the same */ if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0) return error; return 0; } int svr4_sys_resolvepath(td, uap) struct thread *td; struct svr4_sys_resolvepath_args *uap; { struct nameidata nd; int error, *retval = td->td_retval; unsigned int ncopy; int vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return error; vfslocked = NDHASGIANT(&nd); ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1); if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0) goto bad; *retval = ncopy; bad: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return error; } Index: head/sys/contrib/opensolaris/uts/common/fs/gfs.c =================================================================== --- head/sys/contrib/opensolaris/uts/common/fs/gfs.c (revision 175201) +++ head/sys/contrib/opensolaris/uts/common/fs/gfs.c (revision 175202) @@ -1,884 +1,884 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Portions Copyright 2007 Shivakumar GN */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Generic pseudo-filesystem routines. * * There are significant similarities between the implementation of certain file * system entry points across different filesystems. While one could attempt to * "choke up on the bat" and incorporate common functionality into a VOP * preamble or postamble, such an approach is limited in the benefit it can * provide. In this file we instead define a toolkit of routines which can be * called from a filesystem (with in-kernel pseudo-filesystems being the focus * of the exercise) in a more component-like fashion. * * There are three basic classes of routines: * * 1) Lowlevel support routines * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, * without enforcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() * gfs_readdir_emit() * gfs_readdir_emitn() * gfs_readdir_pred() * gfs_readdir_fini() * gfs_lookup_dot() * * 2) Complete GFS management * * These routines take a more active role in management of the * pseudo-filesystem. They handle the relationship between vnode private * data and VFS data, as well as the relationship between vnodes in the * directory hierarchy. * * In order to use these interfaces, the first member of every private * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control * to GFS. * * gfs_file_create() * gfs_dir_create() * gfs_root_create() * * gfs_file_inactive() * gfs_dir_inactive() * gfs_dir_lookup() * gfs_dir_readdir() * * gfs_vop_inactive() * gfs_vop_lookup() * gfs_vop_readdir() * gfs_vop_map() * * 3) Single File pseudo-filesystems * * This routine creates a rooted file to be overlayed ontop of another * file in the physical filespace. * * Note that the parent is NULL (actually the vfs), but there is nothing * technically keeping such a file from utilizing the "Complete GFS * management" set of routines. * * gfs_root_create_file() */ /* * Low level directory routines * * These routines provide some simple abstractions for reading directories. * They are designed to be used by existing pseudo filesystems (namely procfs) * that already have a complicated management infrastructure. */ /* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length * ureclen - the exported file-space record length (1 for non-legacy FSs) * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode * * Returns 0 or a non-zero errno. * * Typical VOP_READDIR usage of gfs_readdir_*: * * if ((error = gfs_readdir_init(...)) != 0) * return (error); * eof = 0; * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { * if (!consumer_entry_at(voffset)) * voffset = consumer_next_entry(voffset); * if (consumer_eof(voffset)) { * eof = 1 * break; * } * if ((error = gfs_readdir_emit(..., voffset, * consumer_ino(voffset), consumer_name(voffset))) != 0) * break; * } * return (gfs_readdir_fini(..., error, eofp, eof)); * * As you can see, a zero result from gfs_readdir_pred() or * gfs_readdir_emit() indicates that processing should continue, * whereas a non-zero result indicates that the loop should terminate. * Most consumers need do nothing more than let gfs_readdir_fini() * determine what the cause of failure was and return the appropriate * value. */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, uio_t *uiop, ino64_t parent, ino64_t self) { if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); st->grd_parent = parent; st->grd_self = self; return (0); } /* * gfs_readdir_emit_int: internal routine to emit directory entry * * st - the current readdir state, which must have d_ino and d_name * set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ static int gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, int *ncookies, u_long **cookies) { int reclen, namlen; namlen = strlen(st->grd_dirent->d_name); reclen = DIRENT64_RECLEN(namlen); if (reclen > uiop->uio_resid) { /* * Error if no entries were returned yet */ if (uiop->uio_resid == st->grd_oresid) return (EINVAL); return (-1); } /* XXX: This can change in the future. */ st->grd_dirent->d_type = DT_DIR; st->grd_dirent->d_reclen = (ushort_t)reclen; st->grd_dirent->d_namlen = namlen; if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); uiop->uio_loffset = next; if (*cookies != NULL) { **cookies = next; (*cookies)++; (*ncookies)--; KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies)); } return (0); } /* * gfs_readdir_emit: emit a directory entry * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, const char *name, int *ncookies, u_long **cookies) { offset_t off = (voff + 2) * st->grd_ureclen; st->grd_dirent->d_ino = ino; (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); /* * Inter-entry offsets are invalid, so we assume a record size of * grd_ureclen and explicitly set the offset appropriately. */ return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies, cookies)); } /* * gfs_readdir_pred: readdir loop predicate * voffp - a pointer in which the next virtual offset should be stored * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp, int *ncookies, u_long **cookies) { offset_t off, voff; int error; top: if (uiop->uio_resid <= 0) return (-1); off = uiop->uio_loffset / st->grd_ureclen; voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, ".", ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, "..", ncookies, cookies)) == 0) goto top; } else { *voffp = voff; return (0); } return (error); } /* * gfs_readdir_fini: generic readdir cleanup * error - if positive, an error to return * eofp - the eofp passed to readdir * eof - the eof value * * Returns a 0 on success, a non-zero errno on failure. This result * should be returned from readdir. */ int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); if (error > 0) return (error); if (eofp) *eofp = eof; return (0); } /* * gfs_lookup_dot * * Performs a basic check for "." and ".." directory entries. */ int gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) { if (*nm == '\0' || strcmp(nm, ".") == 0) { VN_HOLD(dvp); *vpp = dvp; return (0); } else if (strcmp(nm, "..") == 0) { if (pvp == NULL) { ASSERT(dvp->v_flag & VROOT); VN_HOLD(dvp); *vpp = dvp; } else { VN_HOLD(pvp); *vpp = pvp; } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (0); } return (-1); } /* * gfs_file_create(): create a new GFS file * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * * In order to use this interface, the parent vnode must have been created by * gfs_dir_create(), and the private data stored in v_data must have a * 'gfs_file_t' as its first field. * * Given these constraints, this routine will automatically: * * - Allocate v_data for the vnode * - Initialize necessary fields in the vnode * - Hold the parent */ vnode_t * gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops) { gfs_file_t *fp; vnode_t *vp; int error; /* * Allocate vnode and internal data structure */ fp = kmem_zalloc(size, KM_SLEEP); error = getnewvnode("zfs", vfsp, ops, &vp); ASSERT(error == 0); vp->v_data = (caddr_t)fp; /* * Set up various pointers */ fp->gfs_vnode = vp; fp->gfs_parent = pvp; fp->gfs_size = size; fp->gfs_type = GFS_FILE; error = insmntque(vp, vfsp); KASSERT(error == 0, ("insmntque() failed: error %d", error)); /* * Initialize vnode and hold parent. */ if (pvp) VN_HOLD(pvp); return (vp); } /* * gfs_dir_create: creates a new directory in the parent * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * entries - NULL-terminated list of static entries (if any) * maxlen - maximum length of a directory entry * readdir_cb - readdir callback (see gfs_dir_readdir) * inode_cb - inode callback (see gfs_dir_readdir) * lookup_cb - lookup callback (see gfs_dir_lookup) * * In order to use this function, the first member of the private vnode * structure (v_data) must be a gfs_dir_t. For each directory, there are * static entries, defined when the structure is initialized, and dynamic * entries, retrieved through callbacks. * * If a directory has static entries, then it must supply a inode callback, * which will compute the inode number based on the parent and the index. * For a directory with dynamic entries, the caller must supply a readdir * callback and a lookup callback. If a static lookup fails, we fall back to * the supplied lookup callback, if any. * * This function also performs the same initialization as gfs_file_create(). */ vnode_t * gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; gfs_dir_t *dp; gfs_dirent_t *de; vp = gfs_file_create(struct_size, pvp, vfsp, ops); vp->v_type = VDIR; dp = vp->v_data; dp->gfsd_file.gfs_type = GFS_DIR; dp->gfsd_maxlen = maxlen; if (entries != NULL) { for (de = entries; de->gfse_name != NULL; de++) dp->gfsd_nstatic++; dp->gfsd_static = kmem_alloc( dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); bcopy(entries, dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } dp->gfsd_readdir = readdir_cb; dp->gfsd_lookup = lookup_cb; dp->gfsd_inode = inode_cb; mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); return (vp); } /* * gfs_root_create(): create a root vnode for a GFS filesystem * * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The * only difference is that it takes a vfs_t instead of a vnode_t as its parent. */ vnode_t * gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; VFS_HOLD(vfsp); vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb, maxlen, readdir_cb, lookup_cb); /* Manually set the inode */ ((gfs_file_t *)vp->v_data)->gfs_ino = ino; vp->v_flag |= VROOT; return (vp); } /* * gfs_file_inactive() * * Called from the VOP_INACTIVE() routine. If necessary, this routine will * remove the given vnode from the parent directory and clean up any references * in the VFS layer. * * If the vnode was not removed (due to a race with vget), then NULL is * returned. Otherwise, a pointer to the private data is returned. */ void * gfs_file_inactive(vnode_t *vp) { int i; gfs_dirent_t *ge = NULL; gfs_file_t *fp = vp->v_data; gfs_dir_t *dp = NULL; void *data; if (fp->gfs_parent == NULL) goto found; dp = fp->gfs_parent->v_data; /* * First, see if this vnode is cached in the parent. */ gfs_dir_lock(dp); /* * Find it in the set of static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (ge->gfse_vnode == vp) goto found; } /* * If 'ge' is NULL, then it is a dynamic entry. */ ge = NULL; found: VI_LOCK(vp); ASSERT(vp->v_count < 2); /* * Really remove this vnode */ data = vp->v_data; if (ge != NULL) { /* * If this was a statically cached entry, simply set the * cached vnode to NULL. */ ge->gfse_vnode = NULL; } if (vp->v_count == 1) { vp->v_usecount--; vdropl(vp); } else { VI_UNLOCK(vp); } /* * Free vnode and release parent */ if (fp->gfs_parent) { gfs_dir_unlock(dp); VI_LOCK(fp->gfs_parent); fp->gfs_parent->v_usecount--; VI_UNLOCK(fp->gfs_parent); } else { ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } return (data); } /* * gfs_dir_inactive() * * Same as above, but for directories. */ void * gfs_dir_inactive(vnode_t *vp) { gfs_dir_t *dp; ASSERT(vp->v_type == VDIR); if ((dp = gfs_file_inactive(vp)) != NULL) { mutex_destroy(&dp->gfsd_lock); if (dp->gfsd_nstatic) kmem_free(dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } return (dp); } /* * gfs_dir_lookup() * * Looks up the given name in the directory and returns the corresponding vnode, * if found. * * First, we search statically defined entries, if any. If a match is found, * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the * existing vnode. Otherwise, we call the static entry's callback routine, * caching the result if necessary. * * If no static entry is found, we invoke the lookup callback, if any. The * arguments to this callback are: * * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode * * Returns 0 on success, non-zero on error. */ int gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) { int i; gfs_dirent_t *ge; vnode_t *vp; gfs_dir_t *dp = dvp->v_data; int ret = 0; ASSERT(dvp->v_type == VDIR); if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) return (0); gfs_dir_lock(dp); /* * Search static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (strcmp(ge->gfse_name, nm) == 0) { if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); goto out; } /* * We drop the directory lock, as the constructor will * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set * for this entry, we discard the result in favor of the * cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); gfs_dir_lock(dp); ((gfs_file_t *)vp->v_data)->gfs_index = i; /* Set the inode according to the callback. */ ((gfs_file_t *)vp->v_data)->gfs_ino = dp->gfsd_inode(dvp, i); if (ge->gfse_flags & GFS_CACHE_VNODE) { if (ge->gfse_vnode == NULL) { ge->gfse_vnode = vp; } else { /* * A parallel constructor beat us to it; * return existing vnode. We have to be * careful because we can't release the * current vnode while holding the * directory lock; its inactive routine * will try to lock this directory. */ vnode_t *oldvp = vp; vp = ge->gfse_vnode; VN_HOLD(vp); gfs_dir_unlock(dp); VN_RELE(oldvp); gfs_dir_lock(dp); } } goto out; } } /* * See if there is a dynamic constructor. */ if (dp->gfsd_lookup) { ino64_t ino; gfs_file_t *fp; /* * Once again, drop the directory lock, as the lookup routine * will need to allocate memory, or otherwise deadlock on this * directory. */ gfs_dir_unlock(dp); ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); gfs_dir_lock(dp); if (ret != 0) goto out; fp = (gfs_file_t *)vp->v_data; fp->gfs_index = -1; fp->gfs_ino = ino; } else { /* * No static entry found, and there is no lookup callback, so * return ENOENT. */ ret = ENOENT; } out: gfs_dir_unlock(dp); if (ret == 0) *vpp = vp; else *vpp = NULL; return (ret); } /* * gfs_dir_readdir: does a readdir() on the given directory * * dvp - directory vnode * uiop - uio structure * eofp - eof pointer * data - arbitrary data passed to readdir callback * * This routine does all the readdir() dirty work. Even so, the caller must * supply two callbacks in order to get full compatibility. * * If the directory contains static entries, an inode callback must be * specified. This avoids having to create every vnode and call VOP_GETATTR() * when reading the directory. This function has the following arguments: * * ino_t gfs_inode_cb(vnode_t *vp, int index); * * vp - vnode for the directory * index - index in original gfs_dirent_t array * * Returns the inode number for the given entry. * * For directories with dynamic entries, a readdir callback must be provided. * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, * offset_t *off, offset_t *nextoff, void *data) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and * d_ino. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left * untouched. * nextoff - callback must set to offset of next entry. Typically * (off + 1) * data - caller-supplied data * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, u_long **cookies, void *data) { gfs_readdir_state_t gstate; int error, eof = 0; ino64_t ino, pino; offset_t off, next; gfs_dir_t *dp = dvp->v_data; ino = dp->gfsd_file.gfs_ino; if (dp->gfsd_file.gfs_parent == NULL) pino = ino; /* root of filesystem */ else pino = ((gfs_file_t *) (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, pino, ino)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, cookies)) == 0 && !eof) { if (off >= 0 && off < dp->gfsd_nstatic) { ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, off, ino, dp->gfsd_static[off].gfse_name, ncookies, cookies)) != 0) break; } else if (dp->gfsd_readdir) { off -= dp->gfsd_nstatic; if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, data)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; next += dp->gfsd_nstatic + 2; if ((error = gfs_readdir_emit_int(&gstate, uiop, next, ncookies, cookies)) != 0) break; } else { /* * Offset is beyond the end of the static entries, and * we have no dynamic entries. Set EOF. */ eof = 1; } } return (gfs_readdir_fini(&gstate, error, eofp, eof)); } /* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. Given a GFS directory, calls * gfs_dir_readdir() as necessary. */ /* ARGSUSED */ int gfs_vop_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; } */ *ap; { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; int *eofp = ap->a_eofflag; int ncookies = 0; u_long *cookies = NULL; int error; if (ap->a_ncookies) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); if (error == 0) { /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; } else if (ap->a_ncookies) { free(*ap->a_cookies, M_TEMP); *ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } /* * gfs_vop_inactive: VOP_INACTIVE() entry point * * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. */ /* ARGSUSED */ int gfs_vop_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; gfs_file_t *fp = vp->v_data; void *data; if (fp->gfs_type == GFS_DIR) data = gfs_dir_inactive(vp); else data = gfs_file_inactive(vp); if (data != NULL) kmem_free(data, fp->gfs_size); vp->v_data = NULL; return (0); } Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c =================================================================== --- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c (revision 175201) +++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c (revision 175202) @@ -1,1119 +1,1119 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' directory, but this may expand in the * future. The elements are built using the GFS primitives, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding vnode. * * All mounts are handled automatically by the kernel, but unmounts are * (currently) handled from user land. The main reason is that there is no * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. */ #include #include #include #include #include #include #include #include #include typedef struct { char *se_name; vnode_t *se_root; avl_node_t se_node; } zfs_snapentry_t; static int snapentry_compare(const void *a, const void *b) { const zfs_snapentry_t *sa = a; const zfs_snapentry_t *sb = b; int ret = strcmp(sa->se_name, sb->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } static struct vop_vector zfsctl_ops_root; static struct vop_vector zfsctl_ops_snapdir; static struct vop_vector zfsctl_ops_snapshot; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); typedef struct zfsctl_node { gfs_dir_t zc_gfs_private; uint64_t zc_id; timestruc_t zc_cmtime; /* ctime and mtime, always the same */ } zfsctl_node_t; typedef struct zfsctl_snapdir { zfsctl_node_t sd_node; kmutex_t sd_lock; avl_tree_t sd_snaps; } zfsctl_snapdir_t; /* * Root directory elements. We have only a single static entry, 'snapshot'. */ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, { NULL } }; /* include . and .. in the calculation */ #define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ sizeof (gfs_dirent_t)) + 1) /* * Initialize the various GFS pieces we'll need to create and manipulate .zfs * directories. This is called from the ZFS init routine, and initializes the * vnode ops vectors that we'll be using. */ void zfsctl_init(void) { } void zfsctl_fini(void) { } /* * Return the inode number associated with the 'snapshot' directory. */ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { ASSERT(index == 0); return (ZFSCTL_INO_SNAPDIR); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the vfs_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. */ void zfsctl_create(zfsvfs_t *zfsvfs) { vnode_t *vp, *rvp; zfsctl_node_t *zcp; ASSERT(zfsvfs->z_ctldir == NULL); vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = ZFSCTL_INO_ROOT; VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0); ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); VN_URELE(rvp); /* * We're only faking the fact that we have a root of a filesystem for * the sake of the GFS interfaces. Undo the flag manipulation it did * for us. */ vp->v_vflag &= ~VV_ROOT; zfsvfs->z_ctldir = vp; } /* * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. * There might still be more references if we were force unmounted, but only * new zfs_inactive() calls can occur and they don't reference .zfs */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { VN_RELE(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ vnode_t * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); VN_HOLD(zp->z_zfsvfs->z_ctldir); return (zp->z_zfsvfs->z_ctldir); } /* * Common open routine. Disallow any write access. */ /* ARGSUSED */ static int zfsctl_common_open(struct vop_open_args *ap) { int flags = ap->a_mode; if (flags & FWRITE) return (EACCES); return (0); } /* * Common close routine. Nothing to do here. */ /* ARGSUSED */ static int zfsctl_common_close(struct vop_close_args *ap) { return (0); } /* * Common access routine. Disallow writes. */ /* ARGSUSED */ static int zfsctl_common_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { int mode = ap->a_mode; if (mode & VWRITE) return (EACCES); return (0); } /* * Common getattr function. Fill in basic information. */ static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { zfsctl_node_t *zcp = vp->v_data; timestruc_t now; vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = 0; /* * We are a purly virtual object, so we have no * blocksize or allocated blocks. */ vap->va_blksize = 0; vap->va_nblocks = 0; vap->va_seq = 0; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; vap->va_type = VDIR; /* * We live in the now (for atime). */ gethrestime(&now); vap->va_atime = now; vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime; /* FreeBSD: Reset chflags(2) flags. */ vap->va_flags = 0; } static int zfsctl_common_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { vnode_t *vp = ap->a_vp; fid_t *fidp = (void *)ap->a_fid; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_node_t *zcp = vp->v_data; uint64_t object = zcp->zc_id; zfid_short_t *zfid; int i; ZFS_ENTER(zfsvfs); fidp->fid_len = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs znodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; ZFS_EXIT(zfsvfs); return (0); } static int zfsctl_common_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); VI_LOCK(vp); vp->v_data = NULL; VI_UNLOCK(vp); return (0); } /* * .zfs inode namespace * * We need to generate unique inode numbers for all files and directories * within the .zfs pseudo-filesystem. We use the following scheme: * * ENTRY ZFSCTL_INODE * .zfs 1 * .zfs/snapshot 2 * .zfs/snapshot/ objectid(snap) */ #define ZFSCTL_INO_SNAP(id) (id) /* * Get root directory attributes. */ /* ARGSUSED */ static int zfsctl_root_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); return (0); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread); if (err == 0) VOP_UNLOCK(*vpp, 0, curthread); } else { err = gfs_dir_lookup(dvp, nm, vpp); } ZFS_EXIT(zfsvfs); return (err); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup_vop(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; cred_t *cr = ap->a_cnp->cn_cred; int flags = ap->a_cnp->cn_flags; int nameiop = ap->a_cnp->cn_nameiop; char nm[NAME_MAX + 1]; int err; if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) return (EOPNOTSUPP); ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr); if (err == 0 && (nm[0] != '.' || nm[1] != '\0')) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (err); } static struct vop_vector zfsctl_ops_root = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_root_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_root_lookup_vop, .vop_inactive = gfs_vop_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, }; static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (ENAMETOOLONG); (void) strcat(zname, "@"); (void) strcat(zname, name); return (0); } static int zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) { zfsctl_snapdir_t *sdp = dvp->v_data; zfs_snapentry_t search, *sep; struct vop_inactive_args ap; avl_index_t where; int err; ASSERT(MUTEX_HELD(&sdp->sd_lock)); search.se_name = (char *)name; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) return (ENOENT); ASSERT(vn_ismntpt(sep->se_root)); /* this will be dropped by dounmount() */ if ((err = vn_vfswlock(sep->se_root)) != 0) return (err); err = dounmount(vn_mountedvfs(sep->se_root), force, curthread); if (err) return (err); ASSERT(sep->se_root->v_count == 1); ap.a_vp = sep->se_root; gfs_vop_inactive(&ap); avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); return (0); } #if 0 static void zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) { avl_index_t where; vfs_t *vfsp; refstr_t *pathref; char newpath[MAXNAMELEN]; char *tail; ASSERT(MUTEX_HELD(&sdp->sd_lock)); ASSERT(sep != NULL); vfsp = vn_mountedvfs(sep->se_root); ASSERT(vfsp != NULL); vfs_lock_wait(vfsp); /* * Change the name in the AVL tree. */ avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); avl_insert(&sdp->sd_snaps, sep, where); /* * Change the current mountpoint info: * - update the tail of the mntpoint path * - update the tail of the resource path */ pathref = vfs_getmntpoint(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '/')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setmntpoint(vfsp, newpath); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '@')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setresource(vfsp, newpath); vfs_unlock(vfsp); } #endif #if 0 static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; avl_index_t where; char from[MAXNAMELEN], to[MAXNAMELEN]; int err; err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); if (err) return (err); err = zfs_secpolicy_write(from, cr); if (err) return (err); /* * Cannot move snapshots out of the snapdir. */ if (sdvp != tdvp) return (EINVAL); if (strcmp(snm, tnm) == 0) return (0); err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); if (err) return (err); mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { mutex_exit(&sdp->sd_lock); return (ENOENT); } err = dmu_objset_rename(from, to, B_FALSE); if (err == 0) zfsctl_rename_snap(sdp, sep, tnm); mutex_exit(&sdp->sd_lock); return (err); } #endif #if 0 /* ARGSUSED */ static int zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) { zfsctl_snapdir_t *sdp = dvp->v_data; char snapname[MAXNAMELEN]; int err; err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); if (err) return (err); err = zfs_secpolicy_write(snapname, cr); if (err) return (err); mutex_enter(&sdp->sd_lock); err = zfsctl_unmount_snap(dvp, name, 0, cr); if (err) { mutex_exit(&sdp->sd_lock); return (err); } err = dmu_objset_destroy(snapname); mutex_exit(&sdp->sd_lock); return (err); } #endif /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. */ /* ARGSUSED */ int zfsctl_snapdir_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; char nm[NAME_MAX + 1]; zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[MAXNAMELEN]; char *mountpoint; zfs_snapentry_t *sep, search; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); ASSERT(dvp->v_type == VDIR); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) return (0); *vpp = NULL; /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the * spec (which looks like a local path for zfs). We need to * add some flag to domount() to tell it not to do this lookup. */ if (MUTEX_HELD(&sdp->sd_lock)) return (ENOENT); ZFS_ENTER(zfsvfs); mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { *vpp = sep->se_root; VN_HOLD(*vpp); if ((*vpp)->v_mountedhere == NULL) { /* * The snapshot was unmounted behind our backs, * try to remount it. */ goto domount; } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (0); } /* * The requested snapshot is not currently mounted, look it up. */ err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); if (err) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (err); } if (dmu_objset_open(snapname, DMU_OST_ZFS, DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (ENOENT); } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); VN_HOLD(*vpp); avl_insert(&sdp->sd_snaps, sep, where); dmu_objset_close(snap); domount: mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + strlen("/.zfs/snapshot/") + strlen(nm) + 1; mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", dvp->v_vfsp->mnt_stat.f_mntonname, nm); err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0); kmem_free(mountpoint, mountpoint_len); /* FreeBSD: This line was moved from below to avoid a lock recursion. */ if (err == 0) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. */ if (err) { VN_RELE(*vpp); *vpp = NULL; } return (err); } /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[MAXNAMELEN]; uint64_t id, cookie; ZFS_ENTER(zfsvfs); cookie = *offp; if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &cookie) == ENOENT) { *eofp = 1; ZFS_EXIT(zfsvfs); return (0); } (void) strcpy(dp->d_name, snapname); dp->d_ino = ZFSCTL_INO_SNAP(id); *nextp = cookie; ZFS_EXIT(zfsvfs); return (0); } vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) { vnode_t *vp; zfsctl_snapdir_t *sdp; vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, zfsctl_snapdir_readdir_cb, NULL); sdp = vp->v_data; sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&sdp->sd_snaps, snapentry_compare, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); return (vp); } /* ARGSUSED */ static int zfsctl_snapdir_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_snapdir_t *sdp = vp->v_data; ZFS_ENTER(zfsvfs); zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfsctl_snapdir_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfsctl_snapdir_t *sdp = vp->v_data; void *private; private = gfs_dir_inactive(vp); if (private != NULL) { ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); mutex_destroy(&sdp->sd_lock); avl_destroy(&sdp->sd_snaps); kmem_free(private, sizeof (zfsctl_snapdir_t)); } return (0); } static struct vop_vector zfsctl_ops_snapdir = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_snapdir_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_snapdir_lookup, .vop_inactive = zfsctl_snapdir_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, }; static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { vnode_t *vp; zfsctl_node_t *zcp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; return (vp); } static int zfsctl_snapshot_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; struct vop_inactive_args iap; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int locked; vnode_t *dvp; VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); sdp = dvp->v_data; VOP_UNLOCK(dvp, 0, ap->a_td); if (!(locked = MUTEX_HELD(&sdp->sd_lock))) mutex_enter(&sdp->sd_lock); if (vp->v_count > 1) { if (!locked) mutex_exit(&sdp->sd_lock); return (0); } ASSERT(!vn_ismntpt(vp)); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); if (sep->se_root == vp) { avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); break; } sep = next; } ASSERT(sep != NULL); if (!locked) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); /* * Dispose of the vnode for the snapshot mount point. * This is safe to do because once this entry has been removed * from the AVL tree, it can't be found again, so cannot become * "active". If we lookup the same name again we will end up * creating a new vnode. */ iap.a_vp = vp; return (gfs_vop_inactive(&iap)); } static int zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td) { VN_HOLD(*vpp); /* Snapshot should be already mounted, but just in case. */ if (vn_mountedvfs(*vpp) == NULL) return (ENOENT); return (traverse(vpp, lktype)); } static void zfsctl_traverse_end(vnode_t *vp, int err) { if (err == 0) vput(vp); else VN_RELE(vp); } static int zfsctl_snapshot_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; int err; err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td); if (err == 0) err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); zfsctl_traverse_end(vp, err); return (err); } static int zfsctl_snapshot_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { vnode_t *vp = ap->a_vp; int err; err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread); if (err == 0) err = VOP_VPTOFH(vp, (void *)ap->a_fid); zfsctl_traverse_end(vp, err); return (err); } /* * These VP's should never see the light of day. They should always * be covered. */ static struct vop_vector zfsctl_ops_snapshot = { .vop_default = &default_vnodeops, .vop_inactive = zfsctl_snapshot_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_getattr = zfsctl_snapshot_getattr, .vop_fid = zfsctl_snapshot_fid, }; int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *vp; zfsctl_snapdir_t *sdp; zfsctl_node_t *zcp; zfs_snapentry_t *sep; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, kcred); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { vp = sep->se_root; zcp = vp->v_data; if (zcp->zc_id == objsetid) break; sep = AVL_NEXT(&sdp->sd_snaps, sep); } if (sep != NULL) { VN_HOLD(vp); error = traverse(&vp, LK_SHARED | LK_RETRY); if (error == 0) { if (vp == sep->se_root) error = EINVAL; else *zfsvfsp = VTOZ(vp)->z_zfsvfs; } mutex_exit(&sdp->sd_lock); if (error == 0) VN_URELE(vp); else VN_RELE(vp); } else { error = EINVAL; mutex_exit(&sdp->sd_lock); } VN_RELE(dvp); return (error); } /* * Unmount any snapshots for the given filesystem. This is called from * zfs_umount() - if we have a ctldir, then go through and unmount all the * snapshots. */ int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { struct vop_inactive_args ap; zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *svp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, cr); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { svp = sep->se_root; next = AVL_NEXT(&sdp->sd_snaps, sep); /* * If this snapshot is not mounted, then it must * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ if (vn_ismntpt(svp)) { if ((error = vn_vfswlock(svp)) != 0) goto out; /* * Increase usecount, so dounmount() won't vrele() it * to 0 and call zfsctl_snapdir_inactive(). */ VN_HOLD(svp); vfsp = vn_mountedvfs(svp); mtx_lock(&Giant); error = dounmount(vfsp, fflags, curthread); mtx_unlock(&Giant); if (error != 0) { VN_RELE(svp); goto out; } avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); /* * We can't use VN_RELE(), as that will try to * invoke zfsctl_snapdir_inactive(), and that * would lead to an attempt to re-grab the sd_lock. */ ASSERT3U(svp->v_count, ==, 1); ap.a_vp = svp; gfs_vop_inactive(&ap); } sep = next; } out: mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return (error); } Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c =================================================================== --- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c (revision 175201) +++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c (revision 175202) @@ -1,430 +1,430 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Functions to replay ZFS intent log (ZIL) records * The functions are called through a function vector (zfs_replay_vector) * which is indexed by the transaction type. */ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { VATTR_NULL(vap); vap->va_mask = (uint_t)mask; vap->va_type = IFTOVT(mode); vap->va_mode = mode & MODEMASK; vap->va_uid = (uid_t)uid; vap->va_gid = (gid_t)gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } /* ARGSUSED */ static int zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) { return (ENOTSUP); } static int zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) { char *name = (char *)(lr + 1); /* name follows lr_create_t */ char *link; /* symlink content follows name */ znode_t *dzp; vnode_t *vp = NULL; vattr_t va; struct componentname cn; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time and generation number. The generic VOP_CREATE() * doesn't have either concept, so we smuggle the values inside * the vattr's otherwise unused va_ctime and va_nblocks fields. */ ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); va.va_nblocks = lr->lr_gen; cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch ((int)lr->lr_common.lrc_txtype) { case TX_CREATE: error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va); break; case TX_MKDIR: error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va); break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &va, &vp, kcred); break; case TX_SYMLINK: link = name + strlen(name) + 1; error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link); break; default: error = ENOTSUP; } VOP_UNLOCK(ZTOV(dzp), 0, curthread); if (error == 0 && vp != NULL) { VOP_UNLOCK(vp, 0, curthread); VN_RELE(vp); } VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) { char *name = (char *)(lr + 1); /* name follows lr_remove_t */ znode_t *dzp; struct componentname cn; vnode_t *vp; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); bzero(&cn, sizeof(cn)); cn.cn_nameptr = name; cn.cn_namelen = strlen(name); cn.cn_nameiop = DELETE; cn.cn_flags = ISLASTCN | SAVENAME; cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cn.cn_cred = kcred; cn.cn_thread = curthread; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); if (error != 0) { VOP_UNLOCK(ZTOV(dzp), 0, curthread); goto fail; } switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: error = VOP_REMOVE(ZTOV(dzp), vp, &cn); break; case TX_RMDIR: error = VOP_RMDIR(ZTOV(dzp), vp, &cn); break; default: error = ENOTSUP; } vput(vp); VOP_UNLOCK(ZTOV(dzp), 0, curthread); fail: VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) { char *name = (char *)(lr + 1); /* name follows lr_link_t */ znode_t *dzp, *zp; struct componentname cn; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { VN_RELE(ZTOV(dzp)); return (error); } cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread); - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn); VOP_UNLOCK(ZTOV(zp), 0, curthread); VOP_UNLOCK(ZTOV(dzp), 0, curthread); VN_RELE(ZTOV(zp)); VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) { char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; struct componentname scn, tcn; vnode_t *svp, *tvp; kthread_t *td = curthread; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { VN_RELE(ZTOV(sdzp)); return (error); } svp = tvp = NULL; bzero(&scn, sizeof(scn)); scn.cn_nameptr = sname; scn.cn_namelen = strlen(sname); scn.cn_nameiop = DELETE; scn.cn_flags = ISLASTCN | SAVENAME; scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; scn.cn_cred = kcred; scn.cn_thread = td; - vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); VOP_UNLOCK(ZTOV(sdzp), 0, td); if (error != 0) goto fail; VOP_UNLOCK(svp, 0, td); bzero(&tcn, sizeof(tcn)); tcn.cn_nameptr = tname; tcn.cn_namelen = strlen(tname); tcn.cn_nameiop = RENAME; tcn.cn_flags = ISLASTCN | SAVENAME; tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; tcn.cn_cred = kcred; tcn.cn_thread = td; - vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK(ZTOV(tdzp), 0, td); goto fail; } error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn); return (error); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); VN_RELE(ZTOV(tdzp)); VN_RELE(ZTOV(sdzp)); return (error); } static int zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) { char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; int error; ssize_t resid; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log writes out of order, it's possible the * file has been removed. In this case just drop the write * and return success. */ if (error == ENOENT) error = 0; return (error); } error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); VN_RELE(ZTOV(zp)); return (error); } static int zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) { ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); return (EOPNOTSUPP); } static int zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) { znode_t *zp; vattr_t va; vnode_t *vp; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log setattrs out of order, it's possible the * file has been removed. In this case just drop the setattr * and return success. */ if (error == ENOENT) error = 0; return (error); } zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); va.va_size = lr->lr_size; ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, &va, kcred, curthread); VOP_UNLOCK(vp, 0, curthread); VN_RELE(vp); return (error); } static int zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) { ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ #ifdef TODO vsecattr_t vsa; #endif znode_t *zp; int error; if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_aclcnt); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log acls out of order, it's possible the * file has been removed. In this case just drop the acl * and return success. */ if (error == ENOENT) error = 0; return (error); } #ifdef TODO bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); #else error = EOPNOTSUPP; #endif VN_RELE(ZTOV(zp)); return (error); } /* * Callback vectors for replaying records */ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* 0 no such transaction type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ zfs_replay_create, /* TX_MKXATTR */ zfs_replay_create, /* TX_SYMLINK */ zfs_replay_remove, /* TX_REMOVE */ zfs_replay_remove, /* TX_RMDIR */ zfs_replay_link, /* TX_LINK */ zfs_replay_rename, /* TX_RENAME */ zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ zfs_replay_acl, /* TX_ACL */ }; Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 175201) +++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 175202) @@ -1,1021 +1,1021 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_debug_level = 0; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); static int zfs_mount(vfs_t *vfsp, kthread_t *td); static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); static struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_fhtovp = zfs_fhtovp, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; int error; error = vfs_stdsync(vfsp, waitfor, td); if (error != 0) return (error); ZFS_ENTER(zfsvfs); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, UINT64_MAX, 0); else txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval < SPA_MINBLOCKSIZE || newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) newval = SPA_MAXBLOCKSIZE; zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->vfs_bsize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_refresh_properties(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Remount operations default to "rw" unless "ro" is explicitly * specified. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { readonly_changed_cb(zfsvfs, B_TRUE); } else { if (!dmu_objset_is_snapshot(zfsvfs->z_os)) readonly_changed_cb(zfsvfs, B_FALSE); else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) return (EROFS); } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid_changed_cb(zfsvfs, B_FALSE); } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) setuid_changed_cb(zfsvfs, B_FALSE); else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) setuid_changed_cb(zfsvfs, B_TRUE); } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) exec_changed_cb(zfsvfs, B_FALSE); else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) exec_changed_cb(zfsvfs, B_TRUE); if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) atime_changed_cb(zfsvfs, B_TRUE); else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) atime_changed_cb(zfsvfs, B_FALSE); if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) xattr_changed_cb(zfsvfs, B_TRUE); else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) xattr_changed_cb(zfsvfs, B_FALSE); return (0); } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; int readonly, do_readonly = FALSE; int setuid, do_setuid = FALSE; int exec, do_exec = FALSE; int xattr, do_xattr = FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "xattr", xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "recordsize", blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "readonly", readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "setuid", setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "exec", exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclmode", acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); return (0); unregister: /* * We may attempt to unregister some callbacks that are not * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. */ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); return (error); } static int zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) { cred_t *cr = td->td_ucred; uint64_t recordsize, readonly; int error = 0; int mode; zfsvfs_t *zfsvfs; znode_t *zp = NULL; ASSERT(vfsp); ASSERT(osname); /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, * and just bzero up to z_hold_mtx[]. */ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = vfsp; zfsvfs->z_parent = zfsvfs; zfsvfs->z_assign = TXG_NOWAIT; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_MPSAFE; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) goto out; if (readonly) mode = DS_MODE_PRIMARY | DS_MODE_READONLY; else mode = DS_MODE_PRIMARY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); if (error == EROFS) { mode = DS_MODE_PRIMARY | DS_MODE_READONLY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); } if (error) goto out; if (error = zfs_init_fs(zfsvfs, &zp, cr)) goto out; if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t xattr; ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL)) goto out; xattr_changed_cb(zfsvfs, xattr); zfsvfs->z_issnap = B_TRUE; } else { error = zfs_register_callbacks(vfsp); if (error) goto out; zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. */ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector); if (!zil_disable) zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { if (zfsvfs->z_os) dmu_objset_close(zfsvfs->z_os); rw_destroy(&zfsvfs->z_um_lock); mutex_destroy(&zfsvfs->z_znodes_lock); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } else { atomic_add_32(&zfs_active_fs_count, 1); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; struct dsl_dataset *ds; /* * Unregister properties. */ if (!dmu_objset_is_snapshot(os)) { ds = dmu_objset_ds(os); VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); } } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp, kthread_t *td) { char *from; int error; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) return (zfs_refresh_properties(vfsp)); if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); DROP_GIANT(); error = zfs_domount(vfsp, from, td); PICKUP_GIANT(); return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = zfsvfs->z_vfs->vfs_bsize; statp->f_iosize = zfsvfs->z_vfs->vfs_bsize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = ZFS_MAXNAMELEN; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) { *vpp = ZTOV(rootzp); - error = vn_lock(*vpp, flags, td); + error = vn_lock(*vpp, flags); (*vpp)->v_vflag |= VV_ROOT; } ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; cred_t *cr = td->td_ucred; int ret; if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) return (ret); (void) dnlc_purge_vfsp(vfsp, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) return (EBUSY); ASSERT(zfsvfs->z_ctldir->v_count == 1); } zfsctl_destroy(zfsvfs); ASSERT(zfsvfs->z_ctldir == NULL); } /* * Flush all the files. */ ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); ASSERT(zfsvfs->z_ctldir != NULL); } return (ret); } if (fflag & MS_FORCE) { MNT_ILOCK(vfsp); vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(vfsp); zfsvfs->z_unmounted1 = B_TRUE; /* * Wait for all zfs threads to leave zfs. * Grabbing a rwlock as reader in all vops and * as writer here doesn't work because it too easy to get * multiple reader enters as zfs can re-enter itself. * This can lead to deadlock if there is an intervening * rw_enter as writer. * So a file system threads ref count (z_op_cnt) is used. * A polling loop on z_op_cnt may seem inefficient, but * - this saves all threads on exit from having to grab a * mutex in order to cv_signal * - only occurs on forced unmount in the rare case when * there are outstanding threads within the file system. */ while (zfsvfs->z_op_cnt) { delay(1); } } zfs_objset_close(zfsvfs); VFS_RELE(vfsp); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { VN_RELE(ZTOV(zp)); err = EINVAL; } if (err != 0) *vpp = NULL; else { *vpp = ZTOV(zp); - vn_lock(*vpp, flags, curthread); + vn_lock(*vpp, flags); } ZFS_EXIT(zfsvfs); return (err); } static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (EINVAL); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (EINVAL); } /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *vpp = zfsvfs->z_ctldir; ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 0, NULL, NULL) == 0); } else { VN_HOLD(*vpp); } ZFS_EXIT(zfsvfs); /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } zp_gen = zp->z_phys->zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); VN_RELE(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (EINVAL); } *vpp = ZTOV(zp); /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); vnode_create_vobject(*vpp, zp->z_phys->zp_size, td); ZFS_EXIT(zfsvfs); return (0); } static void zfs_objset_close(zfsvfs_t *zfsvfs) { znode_t *zp, *nextzp; objset_t *os = zfsvfs->z_os; /* * For forced unmount, at this point all vops except zfs_inactive * are erroring EIO. We need to now suspend zfs_inactive threads * while we are freeing dbufs before switching zfs_inactive * to use behaviour without a objset. */ rw_enter(&zfsvfs->z_um_lock, RW_WRITER); /* * Release all holds on dbufs * Note, although we have stopped all other vop threads and * zfs_inactive(), the dmu can callback via znode_pageout_func() * which can zfs_znode_free() the znode. * So we lock z_all_znodes; search the list for a held * dbuf; drop the lock (we know zp can't disappear if we hold * a dbuf lock; then regrab the lock and restart. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { nextzp = list_next(&zfsvfs->z_all_znodes, zp); if (zp->z_dbuf_held) { /* dbufs should only be held when force unmounting */ zp->z_dbuf_held = 0; mutex_exit(&zfsvfs->z_znodes_lock); dmu_buf_rele(zp->z_dbuf, NULL); /* Start again */ mutex_enter(&zfsvfs->z_znodes_lock); nextzp = list_head(&zfsvfs->z_all_znodes); } } mutex_exit(&zfsvfs->z_znodes_lock); /* * Unregister properties. */ if (!dmu_objset_is_snapshot(os)) zfs_unregister_callbacks(zfsvfs); /* * Switch zfs_inactive to behaviour without an objset. * It just tosses cached pages and frees the znode & vnode. * Then re-enable zfs_inactive threads in that new behaviour. */ zfsvfs->z_unmounted2 = B_TRUE; rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ /* * Close the zil. Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } /* * Evict all dbufs so that cached znodes will be freed */ if (dmu_objset_evict_dbufs(os, 1)) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); (void) dmu_objset_evict_dbufs(os, 0); } /* * Finally close the objset */ dmu_objset_close(os); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; int i; for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); rw_destroy(&zfsvfs->z_um_lock); mutex_destroy(&zfsvfs->z_znodes_lock); kmem_free(zfsvfs, sizeof (zfsvfs_t)); atomic_add_32(&zfs_active_fs_count, -1); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int val; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes == val) desiredvnodes = (3 * desiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version " ZFS_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); } void zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 175201) +++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 175202) @@ -1,3599 +1,3599 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait the the intent log to commit if it's is a synchronous operation. * Morover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). * In normal operation, this will be TXG_NOWAIT. During ZIL replay, * it will be a specific txg. Either way, dmu_tx_assign() never blocks. * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to * use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, seq, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr) { znode_t *zp = VTOZ(*vpp); /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { znode_t *zp = VTOZ(vp); /* Decrement the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_dec_32(&zp->z_sync_cnt); /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_phys->zp_size; if (noff >= file_sz) { return (ENXIO); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); /* end of file? */ if ((error == ESRCH) || (noff > file_sz)) { /* * Handle the virtual hole at the end of file. */ if (hole) { *off = file_sz; return (0); } return (ENXIO); } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp) { offset_t off; int error; zfsvfs_t *zfsvfs; switch (com) { case _FIOFFS: return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ case _FIOGDIO: case _FIOSDIO: return (0); case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (EFAULT); zfsvfs = VTOZ(vp)->z_zfsvfs; ZFS_ENTER(zfsvfs); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (EFAULT); return (0); } return (ENOTTY); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; vm_page_t m; struct sf_buf *sf; int64_t start, off; int len = nbytes; int error = 0; uint64_t dirbytes; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; dirbytes = 0; VM_OBJECT_LOCK(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { uint64_t bytes = MIN(PAGESIZE - off, len); uint64_t fsize; again: if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && vm_page_is_valid(m, (vm_offset_t)off, bytes)) { uint64_t woff; caddr_t va; if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) goto again; fsize = obj->un_pager.vnp.vnp_size; vm_page_busy(m); vm_page_lock_queues(); vm_page_undirty(m); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (dirbytes > 0) { error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); dirbytes = 0; } if (error == 0) { sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); woff = uio->uio_loffset - off; error = uiomove(va + off, bytes, UIO_WRITE, uio); /* * The uiomove() above could have been partially * successful, that's why we call dmu_write() * below unconditionally. The page was marked * non-dirty above and we would lose the changes * without doing so. If the uiomove() failed * entirely, well, we just write what we got * before one more time. */ dmu_write(os, zp->z_id, woff, MIN(PAGESIZE, fsize - woff), va, tx); sf_buf_free(sf); sched_unpin(); } VM_OBJECT_LOCK(obj); vm_page_wakeup(m); } else { dirbytes += bytes; } len -= bytes; off = 0; if (error) break; } VM_OBJECT_UNLOCK(obj); if (error == 0 && dirbytes > 0) error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; vm_page_t m; struct sf_buf *sf; int64_t start, off; caddr_t va; int len = nbytes; int error = 0; uint64_t dirbytes; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; dirbytes = 0; VM_OBJECT_LOCK(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { uint64_t bytes = MIN(PAGESIZE - off, len); again: if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && vm_page_is_valid(m, (vm_offset_t)off, bytes)) { if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) goto again; vm_page_busy(m); VM_OBJECT_UNLOCK(obj); if (dirbytes > 0) { error = dmu_read_uio(os, zp->z_id, uio, dirbytes); dirbytes = 0; } if (error == 0) { sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = uiomove(va + off, bytes, UIO_READ, uio); sf_buf_free(sf); sched_unpin(); } VM_OBJECT_LOCK(obj); vm_page_wakeup(m); } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { /* * The code below is here to make sendfile(2) work * correctly with ZFS. As pointed out by ups@ * sendfile(2) should be changed to use VOP_GETPAGES(), * but it pessimize performance of sendfile/UFS, that's * why I handle this special case in ZFS code. */ if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) goto again; vm_page_busy(m); VM_OBJECT_UNLOCK(obj); if (dirbytes > 0) { error = dmu_read_uio(os, zp->z_id, uio, dirbytes); dirbytes = 0; } if (error == 0) { sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = dmu_read(os, zp->z_id, start + off, bytes, (void *)(va + off)); sf_buf_free(sf); sched_unpin(); } VM_OBJECT_LOCK(obj); vm_page_wakeup(m); if (error == 0) uio->uio_resid -= bytes; } else { dirbytes += bytes; } len -= bytes; off = 0; if (error) break; } VM_OBJECT_UNLOCK(obj); if (error == 0 && dirbytes > 0) error = dmu_read_uio(os, zp->z_id, uio, dirbytes); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 if success * error code if failure * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; ssize_t n, nbytes; int error; rl_t *rl; ZFS_ENTER(zfsvfs); /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (ioflag & FRSYNC) zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_phys->zp_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_phys->zp_size); n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); if (vn_has_cached_data(vp)) error = mappedread(vp, nbytes, uio); else error = dmu_read_uio(os, zp->z_id, uio, nbytes); if (error) break; n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Fault in the pages of the first n bytes specified by the uio structure. * 1 byte in each page is touched and the uio struct is unmodified. * Any error will exit this routine as this is only a best * attempt to get the pages resident. This is a copy of ufs_trans_touch(). */ static void zfs_prefault_write(ssize_t n, struct uio *uio) { struct iovec *iov; ulong_t cnt, incr; caddr_t p; if (uio->uio_segflg != UIO_USERSPACE) return; iov = uio->uio_iov; while (n) { cnt = MIN(iov->iov_len, n); if (cnt == 0) { /* empty iov entry */ iov++; continue; } n -= cnt; /* * touch each page in this segment. */ p = iov->iov_base; while (cnt) { if (fubyte(p) == -1) return; incr = MIN(cnt, PAGESIZE); p += incr; cnt -= incr; } /* * touch the last byte in case it straddles a page. */ p--; if (fubyte(p) == -1) return; iov++; } } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - IO_APPEND flag set if in append mode. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. */ zfs_prefault_write(n, uio); /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & IO_APPEND) { /* * Range lock for a file append: * The value for the start of range will be determined by * zfs_range_lock() (to guarantee append semantics). * If this write will cause the block size to increase, * zfs_range_lock() will lock the entire file, so we must * later reduce the range after we grow the block size. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); if (rl->r_len == UINT64_MAX) { /* overlocked, zp_size can't change */ woff = uio->uio_loffset = zp->z_phys->zp_size; } else { woff = uio->uio_loffset = rl->r_off; } } else { woff = uio->uio_loffset; /* * Validate file offset */ if (woff < 0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * If we need to grow the block size then zfs_range_lock() * will lock a wider range than we request here. * Later after growing the block size we reduce the range. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* * Check for mandatory locks */ if (MANDMODE((mode_t)zp->z_phys->zp_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (error); } end_size = MAX(zp->z_phys->zp_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { /* * Start a transaction. */ woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); continue; } dmu_tx_abort(tx); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_phys->zp_size) vnode_pager_setsize(vp, woff + nbytes); rw_enter(&zp->z_map_lock, RW_READER); tx_bytes = uio->uio_resid; if (vn_has_cached_data(vp)) { rw_exit(&zp->z_map_lock); error = mappedwrite(vp, nbytes, uio, tx); } else { error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx); rw_exit(&zp->z_map_lock); } tx_bytes -= uio->uio_resid; /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) { zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); } mutex_exit(&zp->z_acl_lock); /* * Update time stamp. NOTE: This marks the bonus buffer as * dirty, so we don't have to do it again for zp_size. */ zfs_time_stamper(zp, CONTENT_MODIFIED, tx); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, uio->uio_loffset); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (FSYNC | FDSYNC)) zil_commit(zilog, zp->z_last_itx, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(dmu_buf_t *db, void *vzgd) { zgd_t *zgd = (zgd_t *)vzgd; rl_t *rl = zgd->zgd_rl; vnode_t *vp = ZTOV(rl->r_zp); int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); dmu_buf_rele(db, vzgd); zfs_range_unlock(rl); VN_RELE(vp); zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); kmem_free(zgd, sizeof (zgd_t)); VFS_UNLOCK_GIANT(vfslocked); } /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t off = lr->lr_offset; dmu_buf_t *db; rl_t *rl; zgd_t *zgd; int dlen = lr->lr_length; /* length of user data */ int error = 0; ASSERT(zio); ASSERT(dlen != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { VN_RELE(ZTOV(zp)); return (ENOENT); } /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ rl = zfs_range_lock(zp, off, dlen, RL_READER); /* test for truncation needs to be done while range locked */ if (off >= zp->z_phys->zp_size) { error = ENOENT; goto out; } VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); } else { /* indirect write */ uint64_t boff; /* block starting offset */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { if (ISP2(zp->z_blksz)) { boff = P2ALIGN_TYPED(off, zp->z_blksz, uint64_t); } else { boff = 0; } dlen = zp->z_blksz; rl = zfs_range_lock(zp, boff, dlen, RL_READER); if (zp->z_blksz == dlen) break; zfs_range_unlock(rl); } /* test for truncation needs to be done while range locked */ if (off >= zp->z_phys->zp_size) { error = ENOENT; goto out; } zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_rl = rl; zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_bp = &lr->lr_blkptr; VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); ASSERT(boff == db->db_offset); lr->lr_blkoff = off - boff; error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz); if (error == 0) { zil_add_vdev(zfsvfs->z_log, DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); } /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before * we can release this dbuf. We will finish everything * up in the zfs_get_done() callback. */ if (error == EINPROGRESS) return (0); dmu_buf_rele(db, zgd); kmem_free(zgd, sizeof (zgd_t)); } out: zfs_range_unlock(rl); VN_RELE(ZTOV(zp)); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); error = zfs_zaccess_rwx(zp, mode, cr); ZFS_EXIT(zfsvfs); return (error); } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 if success * error code if failure * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); *vpp = NULL; #ifdef TODO if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_phys->zp_flags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { VN_RELE(*vpp); } ZFS_EXIT(zfsvfs); return (error); } #endif /* TODO */ if (dvp->v_type != VDIR) { ZFS_EXIT(zfsvfs); return (ENOTDIR); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { /* * Convert device special files */ if (IS_DEVVP(*vpp)) { vnode_t *svp; svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (svp == NULL) error = ENOSYS; else *vpp = svp; } } ZFS_EXIT(zfsvfs); /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { int ltype = 0; if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp, td); VOP_UNLOCK(dvp, 0, td); } - error = vn_lock(*vpp, cnp->cn_lkflags, td); + error = vn_lock(*vpp, cnp->cn_lkflags); if (cnp->cn_flags & ISDOTDOT) - vn_lock(dvp, ltype | LK_RETRY, td); + vn_lock(dvp, ltype | LK_RETRY); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; return (error); } } #ifdef FREEBSD_NAMECACHE /* * Insert name into cache (as non-existent) if appropriate. */ if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, *vpp, cnp); /* * Insert name into cache if appropriate. */ if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } #endif return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; objset_t *os = zfsvfs->z_os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uint64_t zoid; ZFS_ENTER(zfsvfs); top: *vpp = NULL; if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~VSVTX; if (*name == '\0') { /* * Null component name refers to the directory itself. */ VN_HOLD(dvp); zp = dzp; dl = NULL; error = 0; } else { /* possible VN_HOLD(zp) */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); return (error); } } zoid = zp ? zp->z_id : -1ULL; if (zp == NULL) { /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_phys->zp_flags & ZFS_XATTR) && (vap->va_type != VREG)) { error = EINVAL; goto out; } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); ASSERT(zp->z_id == zoid); (void) zfs_link_create(dl, zp, tx, ZNEW); zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); dmu_tx_commit(tx); } else { /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl == EXCL) { error = EEXIST; goto out; } /* * Can't open a directory for writing. */ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { error = EISDIR; goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { error = zfs_freesp(zp, 0, 0, mode, TRUE); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { /* NB: we already did dmu_tx_wait() */ zfs_dirent_unlock(dl); VN_RELE(ZTOV(zp)); goto top; } } } out: if (error == 0) { *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); } if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); /* * If vnode is for a device return a specfs vnode instead. */ if (IS_DEVVP(*vpp)) { struct vnode *svp; svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (svp == NULL) { error = ENOSYS; } *vpp = svp; } } ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ static int zfs_remove(vnode_t *dvp, char *name, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); znode_t *xzp = NULL; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t acl_obj, xattr_obj; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked; int error; ZFS_ENTER(zfsvfs); top: /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = EPERM; goto out; } vnevent_remove(vp); dnlc_remove(dvp, name); may_delete_now = FALSE; /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); if (may_delete_now) dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); /* are there any extended attributes? */ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { /* XXX - do we need this if we are deleting? */ dmu_tx_hold_bonus(tx, xattr_obj); } /* are there any additional acls */ if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (0 && unlinked) { VI_LOCK(vp); delete_now = may_delete_now && vp->v_count == 1 && !vn_has_cached_data(vp) && zp->z_phys->zp_xattr == xattr_obj && zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; VI_UNLOCK(vp); } if (delete_now) { if (zp->z_phys->zp_xattr) { error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); ASSERT3U(error, ==, 0); ASSERT3U(xzp->z_phys->zp_links, ==, 2); dmu_buf_will_dirty(xzp->z_dbuf, tx); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; xzp->z_phys->zp_links = 0; mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); zp->z_phys->zp_xattr = 0; /* probably unnecessary */ } mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count--; ASSERT3U(vp->v_count, ==, 0); VI_UNLOCK(vp); mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); VFS_RELE(zfsvfs->z_vfs); } else if (unlinked) { zfs_unlinked_add(zp, tx); } zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); dmu_tx_commit(tx); out: zfs_dirent_unlock(dl); if (!delete_now) { VN_RELE(vp); } else if (xzp) { /* this rele delayed to prevent nesting transactions */ VN_RELE(ZTOV(xzp)); } ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * * OUT: vpp - vnode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; zfs_dirlock_t *dl; uint64_t zoid = 0; dmu_tx_t *tx; int error; ASSERT(vap->va_type == VDIR); ZFS_ENTER(zfsvfs); if (dzp->z_phys->zp_flags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } top: *vpp = NULL; /* * First make sure the new directory doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); /* * Now put new name in parent dir. */ (void) zfs_link_create(dl, zp, tx, ZNEW); *vpp = ZTOV(zp); zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); dmu_tx_commit(tx); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated */ static int zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } if (vp == cwd) { error = EINVAL; goto out; } vnevent_rmdir(vp); /* * Grab a lock on the directory to make sure that noone is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } #ifdef FREEBSD_NAMECACHE cache_purge(dvp); #endif error = zfs_link_destroy(dl, zp, tx, 0, NULL); if (error == 0) zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); #ifdef FREEBSD_NAMECACHE cache_purge(vp); #endif out: zfs_dirent_unlock(dl); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure. * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ int local_eof; int outcount; int error; uint8_t prefetch; uint8_t type; int ncooks; u_long *cooks = NULL; ZFS_ENTER(zfsvfs); /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; odp = (struct dirent64 *)iovp->iov_base; } if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); objnum = zp->z_phys->zp_parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = ENXIO; goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); } reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = EINVAL; goto update; } break; } /* * Add this entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; outcount += reclen; odp = (dirent64_t *)((intptr_t)odp + reclen); ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0); /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * flags - [UNUSED] * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_phys_t *pzp = zp->z_phys; uint32_t blksize; u_longlong_t nblocks; int error; ZFS_ENTER(zfsvfs); /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(pzp->zp_mode); vap->va_mode = pzp->zp_mode & ~S_IFMT; vap->va_uid = zp->z_phys->zp_uid; vap->va_gid = zp->z_phys->zp_gid; vap->va_nodeid = zp->z_id; vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ vap->va_size = pzp->zp_size; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && (zp->z_phys->zp_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zp->z_lock); dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * flags - ATTR_UTIME set if non-default time values provided. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { struct znode *zp = VTOZ(vp); znode_phys_t *pzp = zp->z_phys; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; vattr_t oldva; uint_t mask = vap->va_mask; uint_t saved_mask; int trim_mask = 0; uint64_t new_mode; znode_t *attrzp; int need_policy = FALSE; int err; if (mask == 0) return (0); if (mask & AT_NOSET) return (EINVAL); if (mask & AT_SIZE && vp->v_type == VDIR) return (EISDIR); if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) return (EINVAL); ZFS_ENTER(zfsvfs); top: attrzp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); } /* * First validate permissions */ if (mask & AT_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ do { err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); /* NB: we already did dmu_tx_wait() if necessary */ } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME)) need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = pzp->zp_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = pzp->zp_mode; oldva.va_uid = zp->z_phys->zp_uid; oldva.va_gid = zp->z_phys->zp_gid; mutex_exit(&zp->z_lock); if (mask & AT_MODE) { if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_phys->zp_acl.z_acl_extern_obj) dmu_tx_hold_write(tx, pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); else dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); } if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); if (err) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (err); } dmu_tx_hold_bonus(tx, attrzp->z_id); } err = dmu_tx_assign(tx, zfsvfs->z_assign); if (err) { if (attrzp) VN_RELE(ZTOV(attrzp)); if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (err); } dmu_buf_will_dirty(zp->z_dbuf, tx); /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ mutex_enter(&zp->z_lock); if (mask & AT_MODE) { err = zfs_acl_chmod_setattr(zp, new_mode, tx); ASSERT3U(err, ==, 0); } if (attrzp) mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { zp->z_phys->zp_uid = (uint64_t)vap->va_uid; if (attrzp) { attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; } } if (mask & AT_GID) { zp->z_phys->zp_gid = (uint64_t)vap->va_gid; if (attrzp) attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; } if (attrzp) mutex_exit(&attrzp->z_lock); if (mask & AT_ATIME) ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); if (mask & AT_MTIME) ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); if (mask & AT_SIZE) zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); else if (mask != 0) zfs_time_stamper_locked(zp, STATE_CHANGED, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); mutex_exit(&zp->z_lock); if (attrzp) VN_RELE(ZTOV(attrzp)); dmu_tx_commit(tx); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) VN_RELE(ZTOV(zl->zl_znode)); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; uint64_t *oidp = &zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = &zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (*oidp == szp->z_id) /* We're a descendant of szp */ return (EINVAL); if (*oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); if (error) return (error); zl->zl_znode = zp; } oidp = &zp->z_phys->zp_parent; rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ static int zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = VTOZ(sdvp); zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; vnode_t *realvp; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr, error; ZFS_ENTER(zfsvfs); /* * Make sure we have the real vp for the target directory. */ if (VOP_REALVP(tdvp, &realvp) == 0) tdvp = realvp; if (tdvp->v_vfsp != sdvp->v_vfsp) { ZFS_EXIT(zfsvfs); return (EXDEV); } tdzp = VTOZ(tdvp); top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != (sdzp->z_phys->zp_flags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { cmp = strcmp(snm, tnm); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) VN_RELE(ZTOV(tzp)); } if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); VN_RELE(ZTOV(szp)); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto out; if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (ZTOV(szp)->v_type == VDIR) { if (ZTOV(tzp)->v_type != VDIR) { error = ENOTDIR; goto out; } } else { if (ZTOV(tzp)->v_type == VDIR) { error = EISDIR; goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } vnevent_rename_src(ZTOV(szp)); if (tzp) vnevent_rename_dest(ZTOV(tzp)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ if (tzp) dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); ASSERT(error == 0); zfs_log_rename(zilog, tx, TX_RENAME, sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); } #ifdef FREEBSD_NAMECACHE if (error == 0) { cache_purge(sdvp); cache_purge(tdvp); } #endif } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * target - Target path of new symlink. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated */ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t zoid; int len = strlen(link); int error; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); top: if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (ENAMETOOLONG); } /* * Attempt to lock directory; fail if entry already exists. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } dmu_buf_will_dirty(dzp->z_dbuf, tx); /* * Create a new object for the symlink. * Put the link content into bonus buffer if it will fit; * otherwise, store it just like any other file data. */ zoid = 0; if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. */ zfs_grow_blocksize(zp, len, tx); VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); bcopy(link, dbp->db_data, len); dmu_buf_rele(dbp, FTAG); } zp->z_phys->zp_size = len; /* * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); out: if (error == 0) { zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); } dmu_tx_commit(tx); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uoip - structure to contain the link path. * cr - credentials of caller. * * OUT: uio - structure to contain the link path. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; size_t bufsz; int error; ZFS_ENTER(zfsvfs); bufsz = (size_t)zp->z_phys->zp_size; if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { error = uiomove(zp->z_phys + 1, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); } else { dmu_buf_t *dbp; error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); if (error) { ZFS_EXIT(zfsvfs); return (error); } error = uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); dmu_buf_rele(dbp, FTAG); } ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; zfs_dirlock_t *dl; dmu_tx_t *tx; vnode_t *realvp; int error; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); if (VOP_REALVP(svp, &realvp) == 0) svp = realvp; if (svp->v_vfsp != tdvp->v_vfsp) { ZFS_EXIT(zfsvfs); return (EXDEV); } szp = VTOZ(svp); top: /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_phys->zp_flags & ZFS_XATTR) != (dzp->z_phys->zp_flags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (EPERM); } if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dl, szp, tx, 0); if (error == 0) zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); dmu_tx_commit(tx); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } void zfs_inactive(vnode_t *vp, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_um_lock, RW_READER); if (zfsvfs->z_unmounted2) { ASSERT(zp->z_dbuf_held == 0); mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count = 0; /* count arrives as 1 */ VI_UNLOCK(vp); if (zp->z_dbuf == NULL) { mutex_exit(&zp->z_lock); zfs_znode_free(zp); } else { mutex_exit(&zp->z_lock); } rw_exit(&zfsvfs->z_um_lock); VFS_RELE(zfsvfs->z_vfs); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_buf_will_dirty(zp->z_dbuf, tx); mutex_enter(&zp->z_lock); zp->z_atime_dirty = 0; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); rw_exit(&zfsvfs->z_um_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); static int zfs_fid(vnode_t *vp, fid_t *fidp) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen = (uint32_t)zp->z_phys->zp_gen; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i; ZFS_ENTER(zfsvfs); size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; zfs_dirlock_t *dl; int error; switch (cmd) { case _PC_LINK_MAX: *valp = INT_MAX; return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #if 0 case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); *valp = 0; error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { zfs_dirent_unlock(dl); if (!zfs_dirempty(xzp)) *valp = 1; VN_RELE(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. */ error = 0; } ZFS_EXIT(zfsvfs); return (error); #endif case _PC_ACL_EXTENDED: *valp = 0; /* TODO */ return (0); case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); default: return (EOPNOTSUPP); } } #ifdef TODO /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); error = zfs_getacl(zp, vsecp, cr); ZFS_EXIT(zfsvfs); return (error); } #endif /* TODO */ #ifdef TODO /*ARGSUSED*/ static int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); error = zfs_setacl(zp, vsecp, cr); ZFS_EXIT(zfsvfs); return (error); } #endif /* TODO */ static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred); if (error == 0) vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL)); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred)); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred)); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vattr_t *vap = ap->a_vap; /* No support for FreeBSD's chflags(2). */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & SAVENAME); ASSERT(ap->a_tcnp->cn_flags & SAVENAME); error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred); if (tdvp == tvp) VN_RELE(tdvp); else VN_URELE(tdvp); if (tvp) VN_URELE(tvp); VN_RELE(fdvp); VN_RELE(fvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs; int rele = 1; ASSERT(zp != NULL); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); mutex_enter(&zp->z_lock); ASSERT(zp->z_phys); ASSERT(zp->z_dbuf_held); zfsvfs = zp->z_zfsvfs; if (!zp->z_unlinked) { zp->z_dbuf_held = 0; ZTOV(zp) = NULL; mutex_exit(&zp->z_lock); dmu_buf_rele(zp->z_dbuf, NULL); } else { mutex_exit(&zp->z_lock); } VI_LOCK(vp); if (vp->v_count > 0) rele = 0; vp->v_data = NULL; ASSERT(vp->v_holdcnt >= 1); VI_UNLOCK(vp); if (!zp->z_unlinked && rele) VFS_RELE(zfsvfs->z_vfs); return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP) error = vop_stdpathconf(ap); return (error); } /* * Advisory record locking support */ static int zfs_freebsd_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { znode_t *zp = VTOZ(ap->a_vp); return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); } struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, #ifdef FREEBSD_NAMECACHE .vop_lookup = vfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, #else .vop_lookup = zfs_freebsd_lookup, #endif .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_advlock = zfs_freebsd_advlock, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = VOP_EOPNOTSUPP, .vop_fid = zfs_freebsd_fid, }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_fid = zfs_freebsd_fid, }; Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c =================================================================== --- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 175201) +++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 175202) @@ -1,1072 +1,1072 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ #pragma ident "%Z%%M% %I% %E% SMI" #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), "sizeof(znode_t)"); /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL struct kmem_cache *znode_cache = NULL; /*ARGSUSED*/ static void znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) { znode_t *zp = user_ptr; vnode_t *vp; mutex_enter(&zp->z_lock); vp = ZTOV(zp); if (vp == NULL) { mutex_exit(&zp->z_lock); zfs_znode_free(zp); } else if (vp->v_count == 0) { ZTOV(zp) = NULL; vhold(vp); mutex_exit(&zp->z_lock); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp, curthread); VOP_UNLOCK(vp, 0, curthread); vdrop(vp); zfs_znode_free(zp); } else { /* signal force unmount that this znode can be freed */ zp->z_dbuf = NULL; mutex_exit(&zp->z_lock); } } extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; /* * XXX: We cannot use this function as a cache constructor, because * there is one global cache for all file systems and we need * to pass vfsp here, which is not possible, because argument * 'cdrarg' is defined at kmem_cache_create() time. */ static int zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) { znode_t *zp = buf; vnode_t *vp; vfs_t *vfsp = cdrarg; int error; if (cdrarg != NULL) { error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); ASSERT(error == 0); zp->z_vnode = vp; vp->v_data = (caddr_t)zp; vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; } else { zp->z_vnode = NULL; } mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); zp->z_dbuf_held = 0; zp->z_dirlocks = 0; zp->z_lockf = NULL; return (0); } /*ARGSUSED*/ static void zfs_znode_cache_destructor(void *buf, void *cdarg) { znode_t *zp = buf; ASSERT(zp->z_dirlocks == 0); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); mutex_destroy(&zp->z_range_lock); avl_destroy(&zp->z_range_avl); ASSERT(zp->z_dbuf_held == 0); } void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; } /* * zfs_init_fs - Initialize the zfsvfs struct and the file system * incore "master" object. Verify version compatibility. */ int zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) { objset_t *os = zfsvfs->z_os; uint64_t version = ZPL_VERSION; int i, error; dmu_object_info_t doi; uint64_t fsid_guid; *zpp = NULL; /* * XXX - hack to auto-create the pool root filesystem at * the first attempted mount. */ if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ error = dmu_tx_assign(tx, TXG_WAIT); ASSERT3U(error, ==, 0); zfs_create_fs(os, cr, tx); dmu_tx_commit(tx); } error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1, &version); if (error) { return (error); } else if (version != ZPL_VERSION) { (void) printf("Mismatched versions: File system " "is version %lld on-disk format, which is " "incompatible with this software version %lld!", (u_longlong_t)version, ZPL_VERSION); return (ENOTSUP); } /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error) return (error); ASSERT(zfsvfs->z_root != 0); /* * Create the per mount vop tables. */ /* * Initialize zget mutex's */ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); if (error) return (error); ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error) return (error); return (0); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif #ifndef major #define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number */ #endif #ifndef minor #define minor(x) ((int)((x)&0xffff00ff)) /* minor number */ #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } /* * Construct a new znode/vnode and intialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) { znode_t *zp; vnode_t *vp; int error; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0); ASSERT(zp->z_dirlocks == NULL); zp->z_phys = db->db_data; zp->z_zfsvfs = zfsvfs; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_dbuf_held = 0; zp->z_mapcnt = 0; zp->z_last_itx = 0; zp->z_dbuf = db; zp->z_id = obj_num; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); vp = ZTOV(zp); if (vp == NULL) return (zp); error = insmntque(vp, zfsvfs->z_vfs); KASSERT(error == 0, ("insmntque() failed: error %d", error)); vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; case VFIFO: vp->v_op = &zfs_fifoops; break; } return (zp); } static void zfs_znode_dmu_init(znode_t *zp) { znode_t *nzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_buf_t *db = zp->z_dbuf; mutex_enter(&zp->z_lock); nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func); /* * there should be no * concurrent zgets on this object. */ ASSERT3P(nzp, ==, NULL); /* * Slap on VROOT if we are the root znode */ if (zp->z_id == zfsvfs->z_root) { ZTOV(zp)->v_flag |= VROOT; } ASSERT(zp->z_dbuf_held == 0); zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); mutex_exit(&zp->z_lock); } /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * IS_REPLAY - intent log replay * * OUT: oid - ID of created object * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, int bonuslen) { dmu_buf_t *dbp; znode_phys_t *pzp; znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; timestruc_t now; uint64_t gen; int err; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ *oid = vap->va_nodeid; flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { *oid = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); } /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be to needed allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (flag & IS_REPLAY) { err = zap_create_claim(zfsvfs->z_os, *oid, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { *oid = zap_create(zfsvfs->z_os, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { if (flag & IS_REPLAY) { err = dmu_object_claim(zfsvfs->z_os, *oid, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { *oid = dmu_object_alloc(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); dmu_buf_will_dirty(dbp, tx); /* * Initialize the znode physical data to zero. */ ASSERT(dbp->db_size >= sizeof (znode_phys_t)); bzero(dbp->db_data, dbp->db_size); pzp = dbp->db_data; /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_phys = pzp; dzp->z_id = *oid; } /* * If parent is an xattr, so am I. */ if (dzp->z_phys->zp_flags & ZFS_XATTR) flag |= IS_XATTR; if (vap->va_type == VBLK || vap->va_type == VCHR) { pzp->zp_rdev = zfs_expldev(vap->va_rdev); } if (vap->va_type == VDIR) { pzp->zp_size = 2; /* contents ("." and "..") */ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } pzp->zp_parent = dzp->z_id; if (flag & IS_XATTR) pzp->zp_flags |= ZFS_XATTR; pzp->zp_gen = gen; ZFS_TIME_ENCODE(&now, pzp->zp_crtime); ZFS_TIME_ENCODE(&now, pzp->zp_ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); } else { ZFS_TIME_ENCODE(&now, pzp->zp_atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); } else { ZFS_TIME_ENCODE(&now, pzp->zp_mtime); } pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); zfs_perm_init(zp, dzp, flag, vap, tx, cr); if (zpp) { kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); mutex_enter(hash_mtx); zfs_znode_dmu_init(zp); mutex_exit(hash_mtx); *zpp = zp; } else { if (ZTOV(zp) != NULL) ZTOV(zp)->v_count = 0; dmu_buf_rele(dbp, NULL); zfs_znode_free(zp); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; int err; *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_ZNODE || doi.doi_bonus_size < sizeof (znode_phys_t)) { dmu_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EINVAL); } ASSERT(db->db_object == obj_num); ASSERT(db->db_offset == -1); ASSERT(db->db_data != NULL); zp = dmu_buf_get_user(db); if (zp != NULL) { mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { dmu_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (ENOENT); } else if (zp->z_dbuf_held) { dmu_buf_rele(db, NULL); } else { zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); } if (ZTOV(zp) != NULL) VN_HOLD(ZTOV(zp)); else { err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops, &zp->z_vnode); ASSERT(err == 0); vp = ZTOV(zp); vp->v_data = (caddr_t)zp; vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); if (vp->v_type == VDIR) zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ err = insmntque(vp, zfsvfs->z_vfs); KASSERT(err == 0, ("insmntque() failed: error %d", err)); } mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } /* * Not found create new znode/vnode */ zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); ASSERT3U(zp->z_id, ==, obj_num); zfs_znode_dmu_init(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); if (zp->z_phys->zp_acl.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, zp->z_phys->zp_acl.z_acl_extern_obj, tx); ASSERT3U(error, ==, 0); } error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); ASSERT3U(error, ==, 0); zp->z_dbuf_held = 0; ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); dmu_buf_rele(zp->z_dbuf, NULL); } void zfs_zinactive(znode_t *zp) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT(zp->z_dbuf_held && zp->z_phys); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); mutex_enter(&zp->z_lock); VI_LOCK(vp); if (vp->v_count > 0) { /* * If the hold count is greater than zero, somebody has * obtained a new reference on this znode while we were * processing it here, so we are done. */ VI_UNLOCK(vp); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); return; } VI_UNLOCK(vp); /* * If this was the last reference to a file with no links, * remove the file from the file system. */ if (zp->z_unlinked) { ZTOV(zp) = NULL; mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); ASSERT(vp->v_count == 0); vrecycle(vp, curthread); zfs_rmnode(zp); VFS_RELE(zfsvfs->z_vfs); return; } ASSERT(zp->z_phys); ASSERT(zp->z_dbuf_held); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; mutex_enter(&zfsvfs->z_znodes_lock); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); kmem_cache_free(znode_cache, zp); } void zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) { timestruc_t now; ASSERT(MUTEX_HELD(&zp->z_lock)); gethrestime(&now); if (tx) { dmu_buf_will_dirty(zp->z_dbuf, tx); zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); if (flag & AT_MTIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); if (flag & AT_CTIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); } /* * Update the requested znode timestamps with the current time. * If we are in a transaction, then go ahead and mark the znode * dirty in the transaction so the timestamps will go to disk. * Otherwise, we will get pushed next time the znode is updated * in a transaction, or when this znode eventually goes inactive. * * Why is this OK? * 1 - Only the ACCESS time is ever updated outside of a transaction. * 2 - Multiple consecutive updates will be collapsed into a single * znode update by the transaction grouping semantics of the DMU. */ void zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) { mutex_enter(&zp->z_lock); zfs_time_stamper_locked(zp, flag, tx); mutex_exit(&zp->z_lock); } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT3U(error, ==, 0); /* What blocksize did we actually get? */ dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free (0 => to EOF). * flag - current file open mode flags. * * RETURN: 0 if success * error code if failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; rl_t *rl; uint64_t end = off + len; uint64_t size, new_blksz; int error; if (ZTOV(zp)->v_type == VFIFO) return (0); /* * If we will change zp_size then lock the whole file, * otherwise just lock the range being freed. */ if (len == 0 || off + len > zp->z_phys->zp_size) { rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); } else { rl = zfs_range_lock(zp, off, len, RL_WRITER); /* recheck, in case zp_size changed */ if (off + len > zp->z_phys->zp_size) { /* lost race: file size changed, lock whole file */ zfs_range_unlock(rl); rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); } } /* * Nothing to do if file already at desired length. */ size = zp->z_phys->zp_size; if (len == 0 && size == off && off != 0) { zfs_range_unlock(rl); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); new_blksz = 0; if (end > size && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end, SPA_MAXBLOCKSIZE); } else { new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); } else if (off < size) { /* * If len == 0, we are truncating the file. */ dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END); } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) dmu_tx_wait(tx); dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } if (new_blksz) zfs_grow_blocksize(zp, new_blksz, tx); if (end > size || len == 0) zp->z_phys->zp_size = end; if (off < size) { objset_t *os = zfsvfs->z_os; uint64_t rlen = len; if (len == 0) rlen = -1; else if (end > size) rlen = size - off; VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); } if (log) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); } zfs_range_unlock(rl); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ rw_enter(&zp->z_map_lock, RW_WRITER); if (end > size) vnode_pager_setsize(vp, end); else if (len == 0) { #if 0 error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); #else error = vinvalbuf(vp, V_SAVE, curthread, 0, 0); vnode_pager_setsize(vp, end); #endif } rw_exit(&zp->z_map_lock); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) { zfsvfs_t zfsvfs; uint64_t moid, doid, roid = 0; uint64_t version = ZPL_VERSION; int error; znode_t *rootzp = NULL; vattr_t vattr; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx); ASSERT(error == 0); /* * Create a delete queue. */ doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = UID_ROOT; vattr.va_gid = GID_WHEEL; rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); zfs_znode_cache_constructor(rootzp, NULL, 0); rootzp->z_zfsvfs = &zfsvfs; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_dbuf_held = 0; bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); ASSERT3U(rootzp->z_id, ==, roid); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); ASSERT(error == 0); mutex_destroy(&zfsvfs.z_znodes_lock); kmem_cache_free(znode_cache, rootzp); } #endif /* _KERNEL */ /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) { dmu_buf_t *db; dmu_object_info_t doi; znode_phys_t *zp; int error; if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) return (error); dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_ZNODE || doi.doi_bonus_size < sizeof (znode_phys_t)) { dmu_buf_rele(db, FTAG); return (EINVAL); } zp = db->db_data; *pobjp = zp->zp_parent; *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && S_ISDIR(zp->zp_mode); dmu_buf_rele(db, FTAG); return (0); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { char *path = buf + len - 1; int error; *path = '\0'; for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if ((error = zfs_obj_to_pobj(osp, obj, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); bcopy(component, path, complen); obj = pobj; } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } Index: head/sys/dev/hwpmc/hwpmc_mod.c =================================================================== --- head/sys/dev/hwpmc/hwpmc_mod.c (revision 175201) +++ head/sys/dev/hwpmc/hwpmc_mod.c (revision 175202) @@ -1,4627 +1,4627 @@ /*- * Copyright (c) 2003-2007 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* needs to be after */ #include #include /* * Types */ enum pmc_flags { PMC_FLAG_NONE = 0x00, /* do nothing */ PMC_FLAG_REMOVE = 0x01, /* atomically remove entry from hash */ PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */ }; /* * The offset in sysent where the syscall is allocated. */ static int pmc_syscall_num = NO_SYSCALL; struct pmc_cpu **pmc_pcpu; /* per-cpu state */ pmc_value_t *pmc_pcpu_saved; /* saved PMC values: CSW handling */ #define PMC_PCPU_SAVED(C,R) pmc_pcpu_saved[(R) + md->pmd_npmc*(C)] struct mtx_pool *pmc_mtxpool; static int *pmc_pmcdisp; /* PMC row dispositions */ #define PMC_ROW_DISP_IS_FREE(R) (pmc_pmcdisp[(R)] == 0) #define PMC_ROW_DISP_IS_THREAD(R) (pmc_pmcdisp[(R)] > 0) #define PMC_ROW_DISP_IS_STANDALONE(R) (pmc_pmcdisp[(R)] < 0) #define PMC_MARK_ROW_FREE(R) do { \ pmc_pmcdisp[(R)] = 0; \ } while (0) #define PMC_MARK_ROW_STANDALONE(R) do { \ KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&pmc_pmcdisp[(R)], -1); \ KASSERT(pmc_pmcdisp[(R)] >= (-mp_ncpus), ("[pmc,%d] row " \ "disposition error", __LINE__)); \ } while (0) #define PMC_UNMARK_ROW_STANDALONE(R) do { \ atomic_add_int(&pmc_pmcdisp[(R)], 1); \ KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ } while (0) #define PMC_MARK_ROW_THREAD(R) do { \ KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&pmc_pmcdisp[(R)], 1); \ } while (0) #define PMC_UNMARK_ROW_THREAD(R) do { \ atomic_add_int(&pmc_pmcdisp[(R)], -1); \ KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ } while (0) /* various event handlers */ static eventhandler_tag pmc_exit_tag, pmc_fork_tag; /* Module statistics */ struct pmc_op_getdriverstats pmc_stats; /* Machine/processor dependent operations */ struct pmc_mdep *md; /* * Hash tables mapping owner processes and target threads to PMCs. */ struct mtx pmc_processhash_mtx; /* spin mutex */ static u_long pmc_processhashmask; static LIST_HEAD(pmc_processhash, pmc_process) *pmc_processhash; /* * Hash table of PMC owner descriptors. This table is protected by * the shared PMC "sx" lock. */ static u_long pmc_ownerhashmask; static LIST_HEAD(pmc_ownerhash, pmc_owner) *pmc_ownerhash; /* * List of PMC owners with system-wide sampling PMCs. */ static LIST_HEAD(, pmc_owner) pmc_ss_owners; /* * Prototypes */ #ifdef DEBUG static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS); static int pmc_debugflags_parse(char *newstr, char *fence); #endif static int load(struct module *module, int cmd, void *arg); static int pmc_attach_process(struct proc *p, struct pmc *pm); static struct pmc *pmc_allocate_pmc_descriptor(void); static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p); static int pmc_attach_one_process(struct proc *p, struct pmc *pm); static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu); static int pmc_can_attach(struct pmc *pm, struct proc *p); static void pmc_capture_user_callchain(int cpu, struct trapframe *tf); static void pmc_cleanup(void); static int pmc_detach_process(struct proc *p, struct pmc *pm); static int pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags); static void pmc_destroy_owner_descriptor(struct pmc_owner *po); static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p); static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm); static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmc); static struct pmc_process *pmc_find_process_descriptor(struct proc *p, uint32_t mode); static void pmc_force_context_switch(void); static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp); static void pmc_log_all_process_mappings(struct pmc_owner *po); static void pmc_log_kernel_mappings(struct pmc *pm); static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p); static void pmc_maybe_remove_owner(struct pmc_owner *po); static void pmc_process_csw_in(struct thread *td); static void pmc_process_csw_out(struct thread *td); static void pmc_process_exit(void *arg, struct proc *p); static void pmc_process_fork(void *arg, struct proc *p1, struct proc *p2, int n); static void pmc_process_samples(int cpu); static void pmc_release_pmc_descriptor(struct pmc *pmc); static void pmc_remove_owner(struct pmc_owner *po); static void pmc_remove_process_descriptor(struct pmc_process *pp); static void pmc_restore_cpu_binding(struct pmc_binding *pb); static void pmc_save_cpu_binding(struct pmc_binding *pb); static void pmc_select_cpu(int cpu); static int pmc_start(struct pmc *pm); static int pmc_stop(struct pmc *pm); static int pmc_syscall_handler(struct thread *td, void *syscall_args); static void pmc_unlink_target_process(struct pmc *pmc, struct pmc_process *pp); /* * Kernel tunables and sysctl(8) interface. */ SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters"); static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH; TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "callchaindepth", &pmc_callchaindepth); SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_TUN|CTLFLAG_RD, &pmc_callchaindepth, 0, "depth of call chain records"); #ifdef DEBUG struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS; char pmc_debugstr[PMC_DEBUG_STRSIZE]; TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr, sizeof(pmc_debugstr)); SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_TUN, 0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags"); #endif /* * kern.hwpmc.hashrows -- determines the number of rows in the * of the hash table used to look up threads */ static int pmc_hashsize = PMC_HASH_SIZE; TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "hashsize", &pmc_hashsize); SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD, &pmc_hashsize, 0, "rows in hash tables"); /* * kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU */ static int pmc_nsamples = PMC_NSAMPLES; TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nsamples", &pmc_nsamples); SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_TUN|CTLFLAG_RD, &pmc_nsamples, 0, "number of PC samples per CPU"); /* * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool. */ static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE; TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "mtxpoolsize", &pmc_mtxpool_size); SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_TUN|CTLFLAG_RD, &pmc_mtxpool_size, 0, "size of spin mutex pool"); /* * security.bsd.unprivileged_syspmcs -- allow non-root processes to * allocate system-wide PMCs. * * Allowing unprivileged processes to allocate system PMCs is convenient * if system-wide measurements need to be taken concurrently with other * per-process measurements. This feature is turned off by default. */ static int pmc_unprivileged_syspmcs = 0; TUNABLE_INT("security.bsd.unprivileged_syspmcs", &pmc_unprivileged_syspmcs); SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RW, &pmc_unprivileged_syspmcs, 0, "allow unprivileged process to allocate system PMCs"); /* * Hash function. Discard the lower 2 bits of the pointer since * these are always zero for our uses. The hash multiplier is * round((2^LONG_BIT) * ((sqrt(5)-1)/2)). */ #if LONG_BIT == 64 #define _PMC_HM 11400714819323198486u #elif LONG_BIT == 32 #define _PMC_HM 2654435769u #else #error Must know the size of 'long' to compile #endif #define PMC_HASH_PTR(P,M) ((((unsigned long) (P) >> 2) * _PMC_HM) & (M)) /* * Syscall structures */ /* The `sysent' for the new syscall */ static struct sysent pmc_sysent = { 2, /* sy_narg */ pmc_syscall_handler /* sy_call */ }; static struct syscall_module_data pmc_syscall_mod = { load, NULL, &pmc_syscall_num, &pmc_sysent, { 0, NULL } }; static moduledata_t pmc_mod = { PMC_MODULE_NAME, syscall_module_handler, &pmc_syscall_mod }; DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY); MODULE_VERSION(pmc, PMC_VERSION); #ifdef DEBUG enum pmc_dbgparse_state { PMCDS_WS, /* in whitespace */ PMCDS_MAJOR, /* seen a major keyword */ PMCDS_MINOR }; static int pmc_debugflags_parse(char *newstr, char *fence) { char c, *p, *q; struct pmc_debugflags *tmpflags; int error, found, *newbits, tmp; size_t kwlen; MALLOC(tmpflags, struct pmc_debugflags *, sizeof(*tmpflags), M_PMC, M_WAITOK|M_ZERO); p = newstr; error = 0; for (; p < fence && (c = *p); p++) { /* skip white space */ if (c == ' ' || c == '\t') continue; /* look for a keyword followed by "=" */ for (q = p; p < fence && (c = *p) && c != '='; p++) ; if (c != '=') { error = EINVAL; goto done; } kwlen = p - q; newbits = NULL; /* lookup flag group name */ #define DBG_SET_FLAG_MAJ(S,F) \ if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \ newbits = &tmpflags->pdb_ ## F; DBG_SET_FLAG_MAJ("cpu", CPU); DBG_SET_FLAG_MAJ("csw", CSW); DBG_SET_FLAG_MAJ("logging", LOG); DBG_SET_FLAG_MAJ("module", MOD); DBG_SET_FLAG_MAJ("md", MDP); DBG_SET_FLAG_MAJ("owner", OWN); DBG_SET_FLAG_MAJ("pmc", PMC); DBG_SET_FLAG_MAJ("process", PRC); DBG_SET_FLAG_MAJ("sampling", SAM); if (newbits == NULL) { error = EINVAL; goto done; } p++; /* skip the '=' */ /* Now parse the individual flags */ tmp = 0; newflag: for (q = p; p < fence && (c = *p); p++) if (c == ' ' || c == '\t' || c == ',') break; /* p == fence or c == ws or c == "," or c == 0 */ if ((kwlen = p - q) == 0) { *newbits = tmp; continue; } found = 0; #define DBG_SET_FLAG_MIN(S,F) \ if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \ tmp |= found = (1 << PMC_DEBUG_MIN_ ## F) /* a '*' denotes all possible flags in the group */ if (kwlen == 1 && *q == '*') tmp = found = ~0; /* look for individual flag names */ DBG_SET_FLAG_MIN("allocaterow", ALR); DBG_SET_FLAG_MIN("allocate", ALL); DBG_SET_FLAG_MIN("attach", ATT); DBG_SET_FLAG_MIN("bind", BND); DBG_SET_FLAG_MIN("config", CFG); DBG_SET_FLAG_MIN("exec", EXC); DBG_SET_FLAG_MIN("exit", EXT); DBG_SET_FLAG_MIN("find", FND); DBG_SET_FLAG_MIN("flush", FLS); DBG_SET_FLAG_MIN("fork", FRK); DBG_SET_FLAG_MIN("getbuf", GTB); DBG_SET_FLAG_MIN("hook", PMH); DBG_SET_FLAG_MIN("init", INI); DBG_SET_FLAG_MIN("intr", INT); DBG_SET_FLAG_MIN("linktarget", TLK); DBG_SET_FLAG_MIN("mayberemove", OMR); DBG_SET_FLAG_MIN("ops", OPS); DBG_SET_FLAG_MIN("read", REA); DBG_SET_FLAG_MIN("register", REG); DBG_SET_FLAG_MIN("release", REL); DBG_SET_FLAG_MIN("remove", ORM); DBG_SET_FLAG_MIN("sample", SAM); DBG_SET_FLAG_MIN("scheduleio", SIO); DBG_SET_FLAG_MIN("select", SEL); DBG_SET_FLAG_MIN("signal", SIG); DBG_SET_FLAG_MIN("swi", SWI); DBG_SET_FLAG_MIN("swo", SWO); DBG_SET_FLAG_MIN("start", STA); DBG_SET_FLAG_MIN("stop", STO); DBG_SET_FLAG_MIN("syscall", PMS); DBG_SET_FLAG_MIN("unlinktarget", TUL); DBG_SET_FLAG_MIN("write", WRI); if (found == 0) { /* unrecognized flag name */ error = EINVAL; goto done; } if (c == 0 || c == ' ' || c == '\t') { /* end of flag group */ *newbits = tmp; continue; } p++; goto newflag; } /* save the new flag set */ bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags)); done: FREE(tmpflags, M_PMC); return error; } static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS) { char *fence, *newstr; int error; unsigned int n; (void) arg1; (void) arg2; /* unused parameters */ n = sizeof(pmc_debugstr); MALLOC(newstr, char *, n, M_PMC, M_ZERO|M_WAITOK); (void) strlcpy(newstr, pmc_debugstr, n); error = sysctl_handle_string(oidp, newstr, n, req); /* if there is a new string, parse and copy it */ if (error == 0 && req->newptr != NULL) { fence = newstr + (n < req->newlen ? n : req->newlen + 1); if ((error = pmc_debugflags_parse(newstr, fence)) == 0) (void) strlcpy(pmc_debugstr, newstr, sizeof(pmc_debugstr)); } FREE(newstr, M_PMC); return error; } #endif /* * Concurrency Control * * The driver manages the following data structures: * * - target process descriptors, one per target process * - owner process descriptors (and attached lists), one per owner process * - lookup hash tables for owner and target processes * - PMC descriptors (and attached lists) * - per-cpu hardware state * - the 'hook' variable through which the kernel calls into * this module * - the machine hardware state (managed by the MD layer) * * These data structures are accessed from: * * - thread context-switch code * - interrupt handlers (possibly on multiple cpus) * - kernel threads on multiple cpus running on behalf of user * processes doing system calls * - this driver's private kernel threads * * = Locks and Locking strategy = * * The driver uses four locking strategies for its operation: * * - The global SX lock "pmc_sx" is used to protect internal * data structures. * * Calls into the module by syscall() start with this lock being * held in exclusive mode. Depending on the requested operation, * the lock may be downgraded to 'shared' mode to allow more * concurrent readers into the module. Calls into the module from * other parts of the kernel acquire the lock in shared mode. * * This SX lock is held in exclusive mode for any operations that * modify the linkages between the driver's internal data structures. * * The 'pmc_hook' function pointer is also protected by this lock. * It is only examined with the sx lock held in exclusive mode. The * kernel module is allowed to be unloaded only with the sx lock held * in exclusive mode. In normal syscall handling, after acquiring the * pmc_sx lock we first check that 'pmc_hook' is non-null before * proceeding. This prevents races between the thread unloading the module * and other threads seeking to use the module. * * - Lookups of target process structures and owner process structures * cannot use the global "pmc_sx" SX lock because these lookups need * to happen during context switches and in other critical sections * where sleeping is not allowed. We protect these lookup tables * with their own private spin-mutexes, "pmc_processhash_mtx" and * "pmc_ownerhash_mtx". * * - Interrupt handlers work in a lock free manner. At interrupt * time, handlers look at the PMC pointer (phw->phw_pmc) configured * when the PMC was started. If this pointer is NULL, the interrupt * is ignored after updating driver statistics. We ensure that this * pointer is set (using an atomic operation if necessary) before the * PMC hardware is started. Conversely, this pointer is unset atomically * only after the PMC hardware is stopped. * * We ensure that everything needed for the operation of an * interrupt handler is available without it needing to acquire any * locks. We also ensure that a PMC's software state is destroyed only * after the PMC is taken off hardware (on all CPUs). * * - Context-switch handling with process-private PMCs needs more * care. * * A given process may be the target of multiple PMCs. For example, * PMCATTACH and PMCDETACH may be requested by a process on one CPU * while the target process is running on another. A PMC could also * be getting released because its owner is exiting. We tackle * these situations in the following manner: * * - each target process structure 'pmc_process' has an array * of 'struct pmc *' pointers, one for each hardware PMC. * * - At context switch IN time, each "target" PMC in RUNNING state * gets started on hardware and a pointer to each PMC is copied into * the per-cpu phw array. The 'runcount' for the PMC is * incremented. * * - At context switch OUT time, all process-virtual PMCs are stopped * on hardware. The saved value is added to the PMCs value field * only if the PMC is in a non-deleted state (the PMCs state could * have changed during the current time slice). * * Note that since in-between a switch IN on a processor and a switch * OUT, the PMC could have been released on another CPU. Therefore * context switch OUT always looks at the hardware state to turn * OFF PMCs and will update a PMC's saved value only if reachable * from the target process record. * * - OP PMCRELEASE could be called on a PMC at any time (the PMC could * be attached to many processes at the time of the call and could * be active on multiple CPUs). * * We prevent further scheduling of the PMC by marking it as in * state 'DELETED'. If the runcount of the PMC is non-zero then * this PMC is currently running on a CPU somewhere. The thread * doing the PMCRELEASE operation waits by repeatedly doing a * pause() till the runcount comes to zero. * * The contents of a PMC descriptor (struct pmc) are protected using * a spin-mutex. In order to save space, we use a mutex pool. * * In terms of lock types used by witness(4), we use: * - Type "pmc-sx", used by the global SX lock. * - Type "pmc-sleep", for sleep mutexes used by logger threads. * - Type "pmc-per-proc", for protecting PMC owner descriptors. * - Type "pmc-leaf", used for all other spin mutexes. */ /* * save the cpu binding of the current kthread */ static void pmc_save_cpu_binding(struct pmc_binding *pb) { PMCDBG(CPU,BND,2, "%s", "save-cpu"); thread_lock(curthread); pb->pb_bound = sched_is_bound(curthread); pb->pb_cpu = curthread->td_oncpu; thread_unlock(curthread); PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu); } /* * restore the cpu binding of the current thread */ static void pmc_restore_cpu_binding(struct pmc_binding *pb) { PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d", curthread->td_oncpu, pb->pb_cpu); thread_lock(curthread); if (pb->pb_bound) sched_bind(curthread, pb->pb_cpu); else sched_unbind(curthread); thread_unlock(curthread); PMCDBG(CPU,BND,2, "%s", "restore-cpu done"); } /* * move execution over the specified cpu and bind it there. */ static void pmc_select_cpu(int cpu) { KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[pmc,%d] bad cpu number %d", __LINE__, cpu)); /* never move to a disabled CPU */ KASSERT(pmc_cpu_is_disabled(cpu) == 0, ("[pmc,%d] selecting " "disabled CPU %d", __LINE__, cpu)); PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); KASSERT(curthread->td_oncpu == cpu, ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__, cpu, curthread->td_oncpu)); PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu); } /* * Force a context switch. * * We do this by pause'ing for 1 tick -- invoking mi_switch() is not * guaranteed to force a context switch. */ static void pmc_force_context_switch(void) { pause("pmcctx", 1); } /* * Get the file name for an executable. This is a simple wrapper * around vn_fullpath(9). */ static void pmc_getfilename(struct vnode *v, char **fullpath, char **freepath) { struct thread *td; td = curthread; *fullpath = "unknown"; *freepath = NULL; - vn_lock(v, LK_CANRECURSE | LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(v, LK_CANRECURSE | LK_EXCLUSIVE | LK_RETRY); vn_fullpath(td, v, fullpath, freepath); VOP_UNLOCK(v, 0, td); } /* * remove an process owning PMCs */ void pmc_remove_owner(struct pmc_owner *po) { struct pmc *pm, *tmp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG(OWN,ORM,1, "remove-owner po=%p", po); /* Remove descriptor from the owner hash table */ LIST_REMOVE(po, po_next); /* release all owned PMC descriptors */ LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) { PMCDBG(OWN,ORM,2, "pmc=%p", pm); KASSERT(pm->pm_owner == po, ("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po)); pmc_release_pmc_descriptor(pm); /* will unlink from the list */ } KASSERT(po->po_sscount == 0, ("[pmc,%d] SS count not zero", __LINE__)); KASSERT(LIST_EMPTY(&po->po_pmcs), ("[pmc,%d] PMC list not empty", __LINE__)); /* de-configure the log file if present */ if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_deconfigure_log(po); } /* * remove an owner process record if all conditions are met. */ static void pmc_maybe_remove_owner(struct pmc_owner *po) { PMCDBG(OWN,OMR,1, "maybe-remove-owner po=%p", po); /* * Remove owner record if * - this process does not own any PMCs * - this process has not allocated a system-wide sampling buffer */ if (LIST_EMPTY(&po->po_pmcs) && ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } } /* * Add an association between a target process and a PMC. */ static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp) { int ri; struct pmc_target *pt; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL && pp != NULL, ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp)); KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d", __LINE__, pm, pp->pp_proc->p_pid)); KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < ((int) md->pmd_npmc - 1), ("[pmc,%d] Illegal reference count %d for process record %p", __LINE__, pp->pp_refcnt, (void *) pp)); ri = PMC_TO_ROWINDEX(pm); PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p", pm, ri, pp); #ifdef DEBUG LIST_FOREACH(pt, &pm->pm_targets, pt_next) if (pt->pt_process == pp) KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets", __LINE__, pp, pm)); #endif MALLOC(pt, struct pmc_target *, sizeof(struct pmc_target), M_PMC, M_ZERO|M_WAITOK); pt->pt_process = pp; LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next); atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc, (uintptr_t)pm); if (pm->pm_owner->po_owner == pp->pp_proc) pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER; /* * Initialize the per-process values at this row index. */ pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ? pm->pm_sc.pm_reloadcount : 0; pp->pp_refcnt++; } /* * Removes the association between a target process and a PMC. */ static void pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp) { int ri; struct proc *p; struct pmc_target *ptgt; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL && pp != NULL, ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp)); KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt < (int) md->pmd_npmc, ("[pmc,%d] Illegal ref count %d on process record %p", __LINE__, pp->pp_refcnt, (void *) pp)); ri = PMC_TO_ROWINDEX(pm); PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p", pm, ri, pp); KASSERT(pp->pp_pmcs[ri].pp_pmc == pm, ("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__, ri, pm, pp->pp_pmcs[ri].pp_pmc)); pp->pp_pmcs[ri].pp_pmc = NULL; pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0; /* Remove owner-specific flags */ if (pm->pm_owner->po_owner == pp->pp_proc) { pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS; pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER; } pp->pp_refcnt--; /* Remove the target process from the PMC structure */ LIST_FOREACH(ptgt, &pm->pm_targets, pt_next) if (ptgt->pt_process == pp) break; KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found " "in pmc %p", __LINE__, pp->pp_proc, pp, pm)); LIST_REMOVE(ptgt, pt_next); FREE(ptgt, M_PMC); /* if the PMC now lacks targets, send the owner a SIGIO */ if (LIST_EMPTY(&pm->pm_targets)) { p = pm->pm_owner->po_owner; PROC_LOCK(p); psignal(p, SIGIO); PROC_UNLOCK(p); PMCDBG(PRC,SIG,2, "signalling proc=%p signal=%d", p, SIGIO); } } /* * Check if PMC 'pm' may be attached to target process 't'. */ static int pmc_can_attach(struct pmc *pm, struct proc *t) { struct proc *o; /* pmc owner */ struct ucred *oc, *tc; /* owner, target credentials */ int decline_attach, i; /* * A PMC's owner can always attach that PMC to itself. */ if ((o = pm->pm_owner->po_owner) == t) return 0; PROC_LOCK(o); oc = o->p_ucred; crhold(oc); PROC_UNLOCK(o); PROC_LOCK(t); tc = t->p_ucred; crhold(tc); PROC_UNLOCK(t); /* * The effective uid of the PMC owner should match at least one * of the {effective,real,saved} uids of the target process. */ decline_attach = oc->cr_uid != tc->cr_uid && oc->cr_uid != tc->cr_svuid && oc->cr_uid != tc->cr_ruid; /* * Every one of the target's group ids, must be in the owner's * group list. */ for (i = 0; !decline_attach && i < tc->cr_ngroups; i++) decline_attach = !groupmember(tc->cr_groups[i], oc); /* check the read and saved gids too */ if (decline_attach == 0) decline_attach = !groupmember(tc->cr_rgid, oc) || !groupmember(tc->cr_svgid, oc); crfree(tc); crfree(oc); return !decline_attach; } /* * Attach a process to a PMC. */ static int pmc_attach_one_process(struct proc *p, struct pmc *pm) { int ri; char *fullpath, *freepath; struct pmc_process *pp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); /* * Locate the process descriptor corresponding to process 'p', * allocating space as needed. * * Verify that rowindex 'pm_rowindex' is free in the process * descriptor. * * If not, allocate space for a descriptor and link the * process descriptor and PMC. */ ri = PMC_TO_ROWINDEX(pm); if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL) return ENOMEM; if (pp->pp_pmcs[ri].pp_pmc == pm) /* already present at slot [ri] */ return EEXIST; if (pp->pp_pmcs[ri].pp_pmc != NULL) return EBUSY; pmc_link_target_process(pm, pp); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) && (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0) pm->pm_flags |= PMC_F_NEEDS_LOGFILE; pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */ /* issue an attach event to a configured log file */ if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) { pmc_getfilename(p->p_textvp, &fullpath, &freepath); pmclog_process_pmcattach(pm, p->p_pid, fullpath); if (freepath) FREE(freepath, M_TEMP); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmc_log_process_mappings(pm->pm_owner, p); } /* mark process as using HWPMCs */ PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); return 0; } /* * Attach a process and optionally its children */ static int pmc_attach_process(struct proc *p, struct pmc *pm) { int error; struct proc *top; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); /* * If this PMC successfully allowed a GETMSR operation * in the past, disallow further ATTACHes. */ if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0) return EPERM; if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0) return pmc_attach_one_process(p, pm); /* * Traverse all child processes, attaching them to * this PMC. */ sx_slock(&proctree_lock); top = p; for (;;) { if ((error = pmc_attach_one_process(p, pm)) != 0) break; if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } if (error) (void) pmc_detach_process(top, pm); done: sx_sunlock(&proctree_lock); return error; } /* * Detach a process from a PMC. If there are no other PMCs tracking * this process, remove the process structure from its hash table. If * 'flags' contains PMC_FLAG_REMOVE, then free the process structure. */ static int pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags) { int ri; struct pmc_process *pp; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL, ("[pmc,%d] null pm pointer", __LINE__)); ri = PMC_TO_ROWINDEX(pm); PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x", pm, ri, p, p->p_pid, p->p_comm, flags); if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) return ESRCH; if (pp->pp_pmcs[ri].pp_pmc != pm) return EINVAL; pmc_unlink_target_process(pm, pp); /* Issue a detach entry if a log file is configured */ if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_pmcdetach(pm, p->p_pid); /* * If there are no PMCs targetting this process, we remove its * descriptor from the target hash table and unset the P_HWPMC * flag in the struct proc. */ KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc, ("[pmc,%d] Illegal refcnt %d for process struct %p", __LINE__, pp->pp_refcnt, pp)); if (pp->pp_refcnt != 0) /* still a target of some PMC */ return 0; pmc_remove_process_descriptor(pp); if (flags & PMC_FLAG_REMOVE) FREE(pp, M_PMC); PROC_LOCK(p); p->p_flag &= ~P_HWPMC; PROC_UNLOCK(p); return 0; } /* * Detach a process and optionally its descendants from a PMC. */ static int pmc_detach_process(struct proc *p, struct pmc *pm) { struct proc *top; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0) return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE); /* * Traverse all children, detaching them from this PMC. We * ignore errors since we could be detaching a PMC from a * partially attached proc tree. */ sx_slock(&proctree_lock); top = p; for (;;) { (void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE); if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } done: sx_sunlock(&proctree_lock); if (LIST_EMPTY(&pm->pm_targets)) pm->pm_flags &= ~PMC_F_ATTACH_DONE; return 0; } /* * Thread context switch IN */ static void pmc_process_csw_in(struct thread *td) { int cpu; unsigned int ri; struct pmc *pm; struct proc *p; struct pmc_cpu *pc; struct pmc_hw *phw; struct pmc_process *pp; pmc_value_t newvalue; p = td->td_proc; if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL) return; KASSERT(pp->pp_proc == td->td_proc, ("[pmc,%d] not my thread state", __LINE__)); critical_enter(); /* no preemption from this point */ cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */ PMCDBG(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p, p->p_pid, p->p_comm, pp); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[pmc,%d] wierd CPU id %d", __LINE__, cpu)); pc = pmc_pcpu[cpu]; for (ri = 0; ri < md->pmd_npmc; ri++) { if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL) continue; KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] Target PMC in non-virtual mode (%d)", __LINE__, PMC_TO_MODE(pm))); KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] Row index mismatch pmc %d != ri %d", __LINE__, PMC_TO_ROWINDEX(pm), ri)); /* * Only PMCs that are marked as 'RUNNING' need * be placed on hardware. */ if (pm->pm_state != PMC_STATE_RUNNING) continue; /* increment PMC runcount */ atomic_add_rel_32(&pm->pm_runcount, 1); /* configure the HWPMC we are going to use. */ md->pmd_config_pmc(cpu, ri, pm); phw = pc->pc_hwpmcs[ri]; KASSERT(phw != NULL, ("[pmc,%d] null hw pointer", __LINE__)); KASSERT(phw->phw_pmc == pm, ("[pmc,%d] hw->pmc %p != pmc %p", __LINE__, phw->phw_pmc, pm)); /* * Write out saved value and start the PMC. * * Sampling PMCs use a per-process value, while * counting mode PMCs use a per-pmc value that is * inherited across descendants. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { mtx_pool_lock_spin(pmc_mtxpool, pm); newvalue = PMC_PCPU_SAVED(cpu,ri) = pp->pp_pmcs[ri].pp_pmcval; mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC, ("[pmc,%d] illegal mode=%d", __LINE__, PMC_TO_MODE(pm))); mtx_pool_lock_spin(pmc_mtxpool, pm); newvalue = PMC_PCPU_SAVED(cpu, ri) = pm->pm_gv.pm_savedvalue; mtx_pool_unlock_spin(pmc_mtxpool, pm); } PMCDBG(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue); md->pmd_write_pmc(cpu, ri, newvalue); md->pmd_start_pmc(cpu, ri); } /* * perform any other architecture/cpu dependent thread * switch-in actions. */ (void) (*md->pmd_switch_in)(pc, pp); critical_exit(); } /* * Thread context switch OUT. */ static void pmc_process_csw_out(struct thread *td) { int cpu; enum pmc_mode mode; unsigned int ri; struct pmc *pm; struct proc *p; struct pmc_cpu *pc; struct pmc_process *pp; int64_t tmp; pmc_value_t newvalue; /* * Locate our process descriptor; this may be NULL if * this process is exiting and we have already removed * the process from the target process table. * * Note that due to kernel preemption, multiple * context switches may happen while the process is * exiting. * * Note also that if the target process cannot be * found we still need to deconfigure any PMCs that * are currently running on hardware. */ p = td->td_proc; pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE); /* * save PMCs */ critical_enter(); cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */ PMCDBG(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p, p->p_pid, p->p_comm, pp); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[pmc,%d wierd CPU id %d", __LINE__, cpu)); pc = pmc_pcpu[cpu]; /* * When a PMC gets unlinked from a target PMC, it will * be removed from the target's pp_pmc[] array. * * However, on a MP system, the target could have been * executing on another CPU at the time of the unlink. * So, at context switch OUT time, we need to look at * the hardware to determine if a PMC is scheduled on * it. */ for (ri = 0; ri < md->pmd_npmc; ri++) { pm = NULL; (void) (*md->pmd_get_config)(cpu, ri, &pm); if (pm == NULL) /* nothing at this row index */ continue; mode = PMC_TO_MODE(pm); if (!PMC_IS_VIRTUAL_MODE(mode)) continue; /* not a process virtual PMC */ KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] ri mismatch pmc(%d) ri(%d)", __LINE__, PMC_TO_ROWINDEX(pm), ri)); /* Stop hardware if not already stopped */ if (pm->pm_stalled == 0) md->pmd_stop_pmc(cpu, ri); /* reduce this PMC's runcount */ atomic_subtract_rel_32(&pm->pm_runcount, 1); /* * If this PMC is associated with this process, * save the reading. */ if (pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) { KASSERT(pm == pp->pp_pmcs[ri].pp_pmc, ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc)); KASSERT(pp->pp_refcnt > 0, ("[pmc,%d] pp refcnt = %d", __LINE__, pp->pp_refcnt)); md->pmd_read_pmc(cpu, ri, &newvalue); tmp = newvalue - PMC_PCPU_SAVED(cpu,ri); PMCDBG(CSW,SWI,1,"cpu=%d ri=%d tmp=%jd", cpu, ri, tmp); if (mode == PMC_MODE_TS) { /* * For sampling process-virtual PMCs, * we expect the count to be * decreasing as the 'value' * programmed into the PMC is the * number of events to be seen till * the next sampling interrupt. */ if (tmp < 0) tmp += pm->pm_sc.pm_reloadcount; mtx_pool_lock_spin(pmc_mtxpool, pm); pp->pp_pmcs[ri].pp_pmcval -= tmp; if ((int64_t) pp->pp_pmcs[ri].pp_pmcval < 0) pp->pp_pmcs[ri].pp_pmcval += pm->pm_sc.pm_reloadcount; mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { /* * For counting process-virtual PMCs, * we expect the count to be * increasing monotonically, modulo a 64 * bit wraparound. */ KASSERT((int64_t) tmp >= 0, ("[pmc,%d] negative increment cpu=%d " "ri=%d newvalue=%jx saved=%jx " "incr=%jx", __LINE__, cpu, ri, newvalue, PMC_PCPU_SAVED(cpu,ri), tmp)); mtx_pool_lock_spin(pmc_mtxpool, pm); pm->pm_gv.pm_savedvalue += tmp; pp->pp_pmcs[ri].pp_pmcval += tmp; mtx_pool_unlock_spin(pmc_mtxpool, pm); if (pm->pm_flags & PMC_F_LOG_PROCCSW) pmclog_process_proccsw(pm, pp, tmp); } } /* mark hardware as free */ md->pmd_config_pmc(cpu, ri, NULL); } /* * perform any other architecture/cpu dependent thread * switch out functions. */ (void) (*md->pmd_switch_out)(pc, pp); critical_exit(); } /* * Log a KLD operation. */ static void pmc_process_kld_load(struct pmckern_map_in *pkm) { struct pmc_owner *po; sx_assert(&pmc_sx, SX_LOCKED); /* * Notify owners of system sampling PMCs about KLD operations. */ LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, (pid_t) -1, pkm->pm_address, (char *) pkm->pm_file); /* * TODO: Notify owners of (all) process-sampling PMCs too. */ return; } static void pmc_process_kld_unload(struct pmckern_map_out *pkm) { struct pmc_owner *po; sx_assert(&pmc_sx, SX_LOCKED); LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, (pid_t) -1, pkm->pm_address, pkm->pm_address + pkm->pm_size); /* * TODO: Notify owners of process-sampling PMCs. */ } /* * A mapping change for a process. */ static void pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm) { int ri; pid_t pid; char *fullpath, *freepath; const struct pmc *pm; struct pmc_owner *po; const struct pmc_process *pp; freepath = fullpath = NULL; pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath); pid = td->td_proc->p_pid; /* Inform owners of all system-wide sampling PMCs. */ LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, pid, pkm->pm_address, fullpath); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) goto done; /* * Inform sampling PMC owners tracking this process. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL && PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmclog_process_map_in(pm->pm_owner, pid, pkm->pm_address, fullpath); done: if (freepath) FREE(freepath, M_TEMP); } /* * Log an munmap request. */ static void pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm) { int ri; pid_t pid; struct pmc_owner *po; const struct pmc *pm; const struct pmc_process *pp; pid = td->td_proc->p_pid; LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) return; for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL && PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmclog_process_map_out(pm->pm_owner, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); } /* * Log mapping information about the kernel. */ static void pmc_log_kernel_mappings(struct pmc *pm) { struct pmc_owner *po; struct pmckern_map_in *km, *kmbase; sx_assert(&pmc_sx, SX_LOCKED); KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] non-sampling PMC (%p) desires mapping information", __LINE__, (void *) pm)); po = pm->pm_owner; if (po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE) return; /* * Log the current set of kernel modules. */ kmbase = linker_hwpmc_list_objects(); for (km = kmbase; km->pm_file != NULL; km++) { PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file, (void *) km->pm_address); pmclog_process_map_in(po, (pid_t) -1, km->pm_address, km->pm_file); } FREE(kmbase, M_LINKER); po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE; } /* * Log the mappings for a single process. */ static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p) { } /* * Log mappings for all processes in the system. */ static void pmc_log_all_process_mappings(struct pmc_owner *po) { struct proc *p, *top; sx_assert(&pmc_sx, SX_XLOCKED); if ((p = pfind(1)) == NULL) panic("[pmc,%d] Cannot find init", __LINE__); PROC_UNLOCK(p); sx_slock(&proctree_lock); top = p; for (;;) { pmc_log_process_mappings(po, p); if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } done: sx_sunlock(&proctree_lock); } /* * The 'hook' invoked from the kernel proper */ #ifdef DEBUG const char *pmc_hooknames[] = { /* these strings correspond to PMC_FN_* in */ "", "EXEC", "CSW-IN", "CSW-OUT", "SAMPLE", "KLDLOAD", "KLDUNLOAD", "MMAP", "MUNMAP", "CALLCHAIN" }; #endif static int pmc_hook_handler(struct thread *td, int function, void *arg) { PMCDBG(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function, pmc_hooknames[function], arg); switch (function) { /* * Process exec() */ case PMC_FN_PROCESS_EXEC: { char *fullpath, *freepath; unsigned int ri; int is_using_hwpmcs; struct pmc *pm; struct proc *p; struct pmc_owner *po; struct pmc_process *pp; struct pmckern_procexec *pk; sx_assert(&pmc_sx, SX_XLOCKED); p = td->td_proc; pmc_getfilename(p->p_textvp, &fullpath, &freepath); pk = (struct pmckern_procexec *) arg; /* Inform owners of SS mode PMCs of the exec event. */ LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, PMC_ID_INVALID, p->p_pid, pk->pm_entryaddr, fullpath); PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; PROC_UNLOCK(p); if (!is_using_hwpmcs) { if (freepath) FREE(freepath, M_TEMP); break; } /* * PMCs are not inherited across an exec(): remove any * PMCs that this process is the owner of. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } /* * If the process being exec'ed is not the target of any * PMC, we are done. */ if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) { if (freepath) FREE(freepath, M_TEMP); break; } /* * Log the exec event to all monitoring owners. Skip * owners who have already recieved the event because * they had system sampling PMCs active. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) { po = pm->pm_owner; if (po->po_sscount == 0 && po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, pm->pm_id, p->p_pid, pk->pm_entryaddr, fullpath); } if (freepath) FREE(freepath, M_TEMP); PMCDBG(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d", p, p->p_pid, p->p_comm, pk->pm_credentialschanged); if (pk->pm_credentialschanged == 0) /* no change */ break; /* * If the newly exec()'ed process has a different credential * than before, allow it to be the target of a PMC only if * the PMC's owner has sufficient priviledge. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) if (pmc_can_attach(pm, td->td_proc) != 0) pmc_detach_one_process(td->td_proc, pm, PMC_FLAG_NONE); KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc, ("[pmc,%d] Illegal ref count %d on pp %p", __LINE__, pp->pp_refcnt, pp)); /* * If this process is no longer the target of any * PMCs, we can remove the process entry and free * up space. */ if (pp->pp_refcnt == 0) { pmc_remove_process_descriptor(pp); FREE(pp, M_PMC); break; } } break; case PMC_FN_CSW_IN: pmc_process_csw_in(td); break; case PMC_FN_CSW_OUT: pmc_process_csw_out(td); break; /* * Process accumulated PC samples. * * This function is expected to be called by hardclock() for * each CPU that has accumulated PC samples. * * This function is to be executed on the CPU whose samples * are being processed. */ case PMC_FN_DO_SAMPLES: /* * Clear the cpu specific bit in the CPU mask before * do the rest of the processing. If the NMI handler * gets invoked after the "atomic_clear_int()" call * below but before "pmc_process_samples()" gets * around to processing the interrupt, then we will * come back here at the next hardclock() tick (and * may find nothing to do if "pmc_process_samples()" * had already processed the interrupt). We don't * lose the interrupt sample. */ atomic_clear_int(&pmc_cpumask, (1 << PCPU_GET(cpuid))); pmc_process_samples(PCPU_GET(cpuid)); break; case PMC_FN_KLD_LOAD: sx_assert(&pmc_sx, SX_LOCKED); pmc_process_kld_load((struct pmckern_map_in *) arg); break; case PMC_FN_KLD_UNLOAD: sx_assert(&pmc_sx, SX_LOCKED); pmc_process_kld_unload((struct pmckern_map_out *) arg); break; case PMC_FN_MMAP: sx_assert(&pmc_sx, SX_LOCKED); pmc_process_mmap(td, (struct pmckern_map_in *) arg); break; case PMC_FN_MUNMAP: sx_assert(&pmc_sx, SX_LOCKED); pmc_process_munmap(td, (struct pmckern_map_out *) arg); break; case PMC_FN_USER_CALLCHAIN: /* * Record a call chain. */ pmc_capture_user_callchain(PCPU_GET(cpuid), (struct trapframe *) arg); break; default: #ifdef DEBUG KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function)); #endif break; } return 0; } /* * allocate a 'struct pmc_owner' descriptor in the owner hash table. */ static struct pmc_owner * pmc_allocate_owner_descriptor(struct proc *p) { uint32_t hindex; struct pmc_owner *po; struct pmc_ownerhash *poh; hindex = PMC_HASH_PTR(p, pmc_ownerhashmask); poh = &pmc_ownerhash[hindex]; /* allocate space for N pointers and one descriptor struct */ MALLOC(po, struct pmc_owner *, sizeof(struct pmc_owner), M_PMC, M_ZERO|M_WAITOK); po->po_sscount = po->po_error = po->po_flags = 0; po->po_file = NULL; po->po_owner = p; po->po_kthread = NULL; LIST_INIT(&po->po_pmcs); LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */ TAILQ_INIT(&po->po_logbuffers); mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc-per-proc", MTX_SPIN); PMCDBG(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p", p, p->p_pid, p->p_comm, po); return po; } static void pmc_destroy_owner_descriptor(struct pmc_owner *po) { PMCDBG(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)", po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm); mtx_destroy(&po->po_mtx); FREE(po, M_PMC); } /* * find the descriptor corresponding to process 'p', adding or removing it * as specified by 'mode'. */ static struct pmc_process * pmc_find_process_descriptor(struct proc *p, uint32_t mode) { uint32_t hindex; struct pmc_process *pp, *ppnew; struct pmc_processhash *pph; hindex = PMC_HASH_PTR(p, pmc_processhashmask); pph = &pmc_processhash[hindex]; ppnew = NULL; /* * Pre-allocate memory in the FIND_ALLOCATE case since we * cannot call malloc(9) once we hold a spin lock. */ if (mode & PMC_FLAG_ALLOCATE) { /* allocate additional space for 'n' pmc pointers */ MALLOC(ppnew, struct pmc_process *, sizeof(struct pmc_process) + md->pmd_npmc * sizeof(struct pmc_targetstate), M_PMC, M_ZERO|M_WAITOK); } mtx_lock_spin(&pmc_processhash_mtx); LIST_FOREACH(pp, pph, pp_next) if (pp->pp_proc == p) break; if ((mode & PMC_FLAG_REMOVE) && pp != NULL) LIST_REMOVE(pp, pp_next); if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL && ppnew != NULL) { ppnew->pp_proc = p; LIST_INSERT_HEAD(pph, ppnew, pp_next); pp = ppnew; ppnew = NULL; } mtx_unlock_spin(&pmc_processhash_mtx); if (pp != NULL && ppnew != NULL) FREE(ppnew, M_PMC); return pp; } /* * remove a process descriptor from the process hash table. */ static void pmc_remove_process_descriptor(struct pmc_process *pp) { KASSERT(pp->pp_refcnt == 0, ("[pmc,%d] Removing process descriptor %p with count %d", __LINE__, pp, pp->pp_refcnt)); mtx_lock_spin(&pmc_processhash_mtx); LIST_REMOVE(pp, pp_next); mtx_unlock_spin(&pmc_processhash_mtx); } /* * find an owner descriptor corresponding to proc 'p' */ static struct pmc_owner * pmc_find_owner_descriptor(struct proc *p) { uint32_t hindex; struct pmc_owner *po; struct pmc_ownerhash *poh; hindex = PMC_HASH_PTR(p, pmc_ownerhashmask); poh = &pmc_ownerhash[hindex]; po = NULL; LIST_FOREACH(po, poh, po_next) if (po->po_owner == p) break; PMCDBG(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> " "pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po); return po; } /* * pmc_allocate_pmc_descriptor * * Allocate a pmc descriptor and initialize its * fields. */ static struct pmc * pmc_allocate_pmc_descriptor(void) { struct pmc *pmc; MALLOC(pmc, struct pmc *, sizeof(struct pmc), M_PMC, M_ZERO|M_WAITOK); if (pmc != NULL) { pmc->pm_owner = NULL; LIST_INIT(&pmc->pm_targets); } PMCDBG(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc); return pmc; } /* * Destroy a pmc descriptor. */ static void pmc_destroy_pmc_descriptor(struct pmc *pm) { (void) pm; #ifdef DEBUG KASSERT(pm->pm_state == PMC_STATE_DELETED || pm->pm_state == PMC_STATE_FREE, ("[pmc,%d] destroying non-deleted PMC", __LINE__)); KASSERT(LIST_EMPTY(&pm->pm_targets), ("[pmc,%d] destroying pmc with targets", __LINE__)); KASSERT(pm->pm_owner == NULL, ("[pmc,%d] destroying pmc attached to an owner", __LINE__)); KASSERT(pm->pm_runcount == 0, ("[pmc,%d] pmc has non-zero run count %d", __LINE__, pm->pm_runcount)); #endif } static void pmc_wait_for_pmc_idle(struct pmc *pm) { #ifdef DEBUG volatile int maxloop; maxloop = 100 * mp_ncpus; #endif /* * Loop (with a forced context switch) till the PMC's runcount * comes down to zero. */ while (atomic_load_acq_32(&pm->pm_runcount) > 0) { #ifdef DEBUG maxloop--; KASSERT(maxloop > 0, ("[pmc,%d] (ri%d, rc%d) waiting too long for " "pmc to be free", __LINE__, PMC_TO_ROWINDEX(pm), pm->pm_runcount)); #endif pmc_force_context_switch(); } } /* * This function does the following things: * * - detaches the PMC from hardware * - unlinks all target threads that were attached to it * - removes the PMC from its owner's list * - destroy's the PMC private mutex * * Once this function completes, the given pmc pointer can be safely * FREE'd by the caller. */ static void pmc_release_pmc_descriptor(struct pmc *pm) { u_int ri, cpu; enum pmc_mode mode; struct pmc_hw *phw; struct pmc_owner *po; struct pmc_process *pp; struct pmc_target *ptgt, *tmp; struct pmc_binding pb; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm, ("[pmc,%d] null pmc", __LINE__)); ri = PMC_TO_ROWINDEX(pm); mode = PMC_TO_MODE(pm); PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri, mode); /* * First, we take the PMC off hardware. */ cpu = 0; if (PMC_IS_SYSTEM_MODE(mode)) { /* * A system mode PMC runs on a specific CPU. Switch * to this CPU and turn hardware off. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); pmc_select_cpu(cpu); /* switch off non-stalled CPUs */ if (pm->pm_state == PMC_STATE_RUNNING && pm->pm_stalled == 0) { phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; KASSERT(phw->phw_pmc == pm, ("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)", __LINE__, ri, phw->phw_pmc, pm)); PMCDBG(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri); critical_enter(); md->pmd_stop_pmc(cpu, ri); critical_exit(); } PMCDBG(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri); critical_enter(); md->pmd_config_pmc(cpu, ri, NULL); critical_exit(); /* adjust the global and process count of SS mode PMCs */ if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) { po = pm->pm_owner; po->po_sscount--; if (po->po_sscount == 0) { atomic_subtract_rel_int(&pmc_ss_count, 1); LIST_REMOVE(po, po_ssnext); } } pm->pm_state = PMC_STATE_DELETED; pmc_restore_cpu_binding(&pb); /* * We could have references to this PMC structure in * the per-cpu sample queues. Wait for the queue to * drain. */ pmc_wait_for_pmc_idle(pm); } else if (PMC_IS_VIRTUAL_MODE(mode)) { /* * A virtual PMC could be running on multiple CPUs at * a given instant. * * By marking its state as DELETED, we ensure that * this PMC is never further scheduled on hardware. * * Then we wait till all CPUs are done with this PMC. */ pm->pm_state = PMC_STATE_DELETED; /* Wait for the PMCs runcount to come to zero. */ pmc_wait_for_pmc_idle(pm); /* * At this point the PMC is off all CPUs and cannot be * freshly scheduled onto a CPU. It is now safe to * unlink all targets from this PMC. If a * process-record's refcount falls to zero, we remove * it from the hash table. The module-wide SX lock * protects us from races. */ LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) { pp = ptgt->pt_process; pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */ PMCDBG(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt); /* * If the target process record shows that no * PMCs are attached to it, reclaim its space. */ if (pp->pp_refcnt == 0) { pmc_remove_process_descriptor(pp); FREE(pp, M_PMC); } } cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */ } /* * Release any MD resources */ (void) md->pmd_release_pmc(cpu, ri, pm); /* * Update row disposition */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) PMC_UNMARK_ROW_STANDALONE(ri); else PMC_UNMARK_ROW_THREAD(ri); /* unlink from the owner's list */ if (pm->pm_owner) { LIST_REMOVE(pm, pm_next); pm->pm_owner = NULL; } pmc_destroy_pmc_descriptor(pm); } /* * Register an owner and a pmc. */ static int pmc_register_owner(struct proc *p, struct pmc *pmc) { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(p)) == NULL) if ((po = pmc_allocate_owner_descriptor(p)) == NULL) return ENOMEM; KASSERT(pmc->pm_owner == NULL, ("[pmc,%d] attempting to own an initialized PMC", __LINE__)); pmc->pm_owner = po; LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next); PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_pmcallocate(pmc); PMCDBG(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p", po, pmc); return 0; } /* * Return the current row disposition: * == 0 => FREE * > 0 => PROCESS MODE * < 0 => SYSTEM MODE */ int pmc_getrowdisp(int ri) { return pmc_pmcdisp[ri]; } /* * Check if a PMC at row index 'ri' can be allocated to the current * process. * * Allocation can fail if: * - the current process is already being profiled by a PMC at index 'ri', * attached to it via OP_PMCATTACH. * - the current process has already allocated a PMC at index 'ri' * via OP_ALLOCATE. */ static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu) { enum pmc_mode mode; struct pmc *pm; struct pmc_owner *po; struct pmc_process *pp; PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d " "cpu=%d", p, p->p_pid, p->p_comm, ri, cpu); /* * We shouldn't have already allocated a process-mode PMC at * row index 'ri'. * * We shouldn't have allocated a system-wide PMC on the same * CPU and same RI. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) LIST_FOREACH(pm, &po->po_pmcs, pm_next) { if (PMC_TO_ROWINDEX(pm) == ri) { mode = PMC_TO_MODE(pm); if (PMC_IS_VIRTUAL_MODE(mode)) return EEXIST; if (PMC_IS_SYSTEM_MODE(mode) && (int) PMC_TO_CPU(pm) == cpu) return EEXIST; } } /* * We also shouldn't be the target of any PMC at this index * since otherwise a PMC_ATTACH to ourselves will fail. */ if ((pp = pmc_find_process_descriptor(p, 0)) != NULL) if (pp->pp_pmcs[ri].pp_pmc) return EEXIST; PMCDBG(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok", p, p->p_pid, p->p_comm, ri); return 0; } /* * Check if a given PMC at row index 'ri' can be currently used in * mode 'mode'. */ static int pmc_can_allocate_row(int ri, enum pmc_mode mode) { enum pmc_disp disp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode); if (PMC_IS_SYSTEM_MODE(mode)) disp = PMC_DISP_STANDALONE; else disp = PMC_DISP_THREAD; /* * check disposition for PMC row 'ri': * * Expected disposition Row-disposition Result * * STANDALONE STANDALONE or FREE proceed * STANDALONE THREAD fail * THREAD THREAD or FREE proceed * THREAD STANDALONE fail */ if (!PMC_ROW_DISP_IS_FREE(ri) && !(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) && !(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri))) return EBUSY; /* * All OK */ PMCDBG(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode); return 0; } /* * Find a PMC descriptor with user handle 'pmcid' for thread 'td'. */ static struct pmc * pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid) { struct pmc *pm; KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc, ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__, PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc)); LIST_FOREACH(pm, &po->po_pmcs, pm_next) if (pm->pm_id == pmcid) return pm; return NULL; } static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc) { struct pmc *pm; struct pmc_owner *po; PMCDBG(PMC,FND,1, "find-pmc id=%d", pmcid); if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL) return ESRCH; if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL) return EINVAL; PMCDBG(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm); *pmc = pm; return 0; } /* * Start a PMC. */ static int pmc_start(struct pmc *pm) { int error, cpu, ri; enum pmc_mode mode; struct pmc_owner *po; struct pmc_binding pb; KASSERT(pm != NULL, ("[pmc,%d] null pm", __LINE__)); mode = PMC_TO_MODE(pm); ri = PMC_TO_ROWINDEX(pm); error = 0; PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri); po = pm->pm_owner; /* * Disallow PMCSTART if a logfile is required but has not been * configured yet. */ if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) && (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return EDOOFUS; /* programming error */ /* * If this is a sampling mode PMC, log mapping information for * the kernel modules that are currently loaded. */ if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmc_log_kernel_mappings(pm); if (PMC_IS_VIRTUAL_MODE(mode)) { /* * If a PMCATTACH has never been done on this PMC, * attach it to its owner process. */ if (LIST_EMPTY(&pm->pm_targets)) error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH : pmc_attach_process(po->po_owner, pm); /* * If the PMC is attached to its owner, then force a context * switch to ensure that the MD state gets set correctly. */ if (error == 0) { pm->pm_state = PMC_STATE_RUNNING; if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) pmc_force_context_switch(); } return error; } /* * A system-wide PMC. * * Add the owner to the global list if this is a system-wide * sampling PMC. */ if (mode == PMC_MODE_SS) { if (po->po_sscount == 0) { LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext); atomic_add_rel_int(&pmc_ss_count, 1); PMCDBG(PMC,OPS,1, "po=%p in global list", po); } po->po_sscount++; } /* Log mapping information for all processes in the system. */ pmc_log_all_process_mappings(po); /* * Move to the CPU associated with this * PMC, and start the hardware. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); if (pmc_cpu_is_disabled(cpu)) return ENXIO; pmc_select_cpu(cpu); /* * global PMCs are configured at allocation time * so write out the initial value and start the PMC. */ pm->pm_state = PMC_STATE_RUNNING; critical_enter(); if ((error = md->pmd_write_pmc(cpu, ri, PMC_IS_SAMPLING_MODE(mode) ? pm->pm_sc.pm_reloadcount : pm->pm_sc.pm_initial)) == 0) error = md->pmd_start_pmc(cpu, ri); critical_exit(); pmc_restore_cpu_binding(&pb); return error; } /* * Stop a PMC. */ static int pmc_stop(struct pmc *pm) { int cpu, error, ri; struct pmc_owner *po; struct pmc_binding pb; KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__)); PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm, PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm)); pm->pm_state = PMC_STATE_STOPPED; /* * If the PMC is a virtual mode one, changing the state to * non-RUNNING is enough to ensure that the PMC never gets * scheduled. * * If this PMC is current running on a CPU, then it will * handled correctly at the time its target process is context * switched out. */ if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) return 0; /* * A system-mode PMC. Move to the CPU associated with * this PMC, and stop the hardware. We update the * 'initial count' so that a subsequent PMCSTART will * resume counting from the current hardware count. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[pmc,%d] illegal cpu=%d", __LINE__, cpu)); if (pmc_cpu_is_disabled(cpu)) return ENXIO; pmc_select_cpu(cpu); ri = PMC_TO_ROWINDEX(pm); critical_enter(); if ((error = md->pmd_stop_pmc(cpu, ri)) == 0) error = md->pmd_read_pmc(cpu, ri, &pm->pm_sc.pm_initial); critical_exit(); pmc_restore_cpu_binding(&pb); po = pm->pm_owner; /* remove this owner from the global list of SS PMC owners */ if (PMC_TO_MODE(pm) == PMC_MODE_SS) { po->po_sscount--; if (po->po_sscount == 0) { atomic_subtract_rel_int(&pmc_ss_count, 1); LIST_REMOVE(po, po_ssnext); PMCDBG(PMC,OPS,2,"po=%p removed from global list", po); } } return error; } #ifdef DEBUG static const char *pmc_op_to_name[] = { #undef __PMC_OP #define __PMC_OP(N, D) #N , __PMC_OPS() NULL }; #endif /* * The syscall interface */ #define PMC_GET_SX_XLOCK(...) do { \ sx_xlock(&pmc_sx); \ if (pmc_hook == NULL) { \ sx_xunlock(&pmc_sx); \ return __VA_ARGS__; \ } \ } while (0) #define PMC_DOWNGRADE_SX() do { \ sx_downgrade(&pmc_sx); \ is_sx_downgraded = 1; \ } while (0) static int pmc_syscall_handler(struct thread *td, void *syscall_args) { int error, is_sx_downgraded, op; struct pmc_syscall_args *c; void *arg; PMC_GET_SX_XLOCK(ENOSYS); DROP_GIANT(); is_sx_downgraded = 0; c = (struct pmc_syscall_args *) syscall_args; op = c->pmop_code; arg = c->pmop_data; PMCDBG(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op, pmc_op_to_name[op], arg); error = 0; atomic_add_int(&pmc_stats.pm_syscalls, 1); switch(op) { /* * Configure a log file. * * XXX This OP will be reworked. */ case PMC_OP_CONFIGURELOG: { struct proc *p; struct pmc *pm; struct pmc_owner *po; struct pmc_op_configurelog cl; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &cl, sizeof(cl))) != 0) break; /* mark this process as owning a log file */ p = td->td_proc; if ((po = pmc_find_owner_descriptor(p)) == NULL) if ((po = pmc_allocate_owner_descriptor(p)) == NULL) { error = ENOMEM; break; } /* * If a valid fd was passed in, try to configure that, * otherwise if 'fd' was less than zero and there was * a log file configured, flush its buffers and * de-configure it. */ if (cl.pm_logfd >= 0) error = pmclog_configure_log(po, cl.pm_logfd); else if (po->po_flags & PMC_PO_OWNS_LOGFILE) { pmclog_process_closelog(po); error = pmclog_flush(po); if (error == 0) { LIST_FOREACH(pm, &po->po_pmcs, pm_next) if (pm->pm_flags & PMC_F_NEEDS_LOGFILE && pm->pm_state == PMC_STATE_RUNNING) pmc_stop(pm); error = pmclog_deconfigure_log(po); } } else error = EINVAL; if (error) break; } break; /* * Flush a log file. */ case PMC_OP_FLUSHLOG: { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } error = pmclog_flush(po); } break; /* * Retrieve hardware configuration. */ case PMC_OP_GETCPUINFO: /* CPU information */ { struct pmc_op_getcpuinfo gci; gci.pm_cputype = md->pmd_cputype; gci.pm_ncpu = mp_ncpus; gci.pm_npmc = md->pmd_npmc; gci.pm_nclass = md->pmd_nclass; bcopy(md->pmd_classes, &gci.pm_classes, sizeof(gci.pm_classes)); error = copyout(&gci, arg, sizeof(gci)); } break; /* * Get module statistics */ case PMC_OP_GETDRIVERSTATS: { struct pmc_op_getdriverstats gms; bcopy(&pmc_stats, &gms, sizeof(gms)); error = copyout(&gms, arg, sizeof(gms)); } break; /* * Retrieve module version number */ case PMC_OP_GETMODULEVERSION: { uint32_t cv, modv; /* retrieve the client's idea of the ABI version */ if ((error = copyin(arg, &cv, sizeof(uint32_t))) != 0) break; /* don't service clients newer than our driver */ modv = PMC_VERSION; if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) { error = EPROGMISMATCH; break; } error = copyout(&modv, arg, sizeof(int)); } break; /* * Retrieve the state of all the PMCs on a given * CPU. */ case PMC_OP_GETPMCINFO: { uint32_t cpu, n, npmc; size_t pmcinfo_size; struct pmc *pm; struct pmc_info *p, *pmcinfo; struct pmc_op_getpmcinfo *gpi; struct pmc_owner *po; struct pmc_binding pb; PMC_DOWNGRADE_SX(); gpi = (struct pmc_op_getpmcinfo *) arg; if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0) break; if (cpu >= (unsigned int) mp_ncpus) { error = EINVAL; break; } if (pmc_cpu_is_disabled(cpu)) { error = ENXIO; break; } /* switch to CPU 'cpu' */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); npmc = md->pmd_npmc; pmcinfo_size = npmc * sizeof(struct pmc_info); MALLOC(pmcinfo, struct pmc_info *, pmcinfo_size, M_PMC, M_WAITOK); p = pmcinfo; for (n = 0; n < md->pmd_npmc; n++, p++) { if ((error = md->pmd_describe(cpu, n, p, &pm)) != 0) break; if (PMC_ROW_DISP_IS_STANDALONE(n)) p->pm_rowdisp = PMC_DISP_STANDALONE; else if (PMC_ROW_DISP_IS_THREAD(n)) p->pm_rowdisp = PMC_DISP_THREAD; else p->pm_rowdisp = PMC_DISP_FREE; p->pm_ownerpid = -1; if (pm == NULL) /* no PMC associated */ continue; po = pm->pm_owner; KASSERT(po->po_owner != NULL, ("[pmc,%d] pmc_owner had a null proc pointer", __LINE__)); p->pm_ownerpid = po->po_owner->p_pid; p->pm_mode = PMC_TO_MODE(pm); p->pm_event = pm->pm_event; p->pm_flags = pm->pm_flags; if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) p->pm_reloadcount = pm->pm_sc.pm_reloadcount; } pmc_restore_cpu_binding(&pb); /* now copy out the PMC info collected */ if (error == 0) error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size); FREE(pmcinfo, M_PMC); } break; /* * Set the administrative state of a PMC. I.e. whether * the PMC is to be used or not. */ case PMC_OP_PMCADMIN: { int cpu, ri; enum pmc_state request; struct pmc_cpu *pc; struct pmc_hw *phw; struct pmc_op_pmcadmin pma; struct pmc_binding pb; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); error = priv_check(td, PRIV_PMC_MANAGE); if (error) break; if ((error = copyin(arg, &pma, sizeof(pma))) != 0) break; cpu = pma.pm_cpu; if (cpu < 0 || cpu >= mp_ncpus) { error = EINVAL; break; } if (pmc_cpu_is_disabled(cpu)) { error = ENXIO; break; } request = pma.pm_state; if (request != PMC_STATE_DISABLED && request != PMC_STATE_FREE) { error = EINVAL; break; } ri = pma.pm_pmc; /* pmc id == row index */ if (ri < 0 || ri >= (int) md->pmd_npmc) { error = EINVAL; break; } /* * We can't disable a PMC with a row-index allocated * for process virtual PMCs. */ if (PMC_ROW_DISP_IS_THREAD(ri) && request == PMC_STATE_DISABLED) { error = EBUSY; break; } /* * otherwise, this PMC on this CPU is either free or * in system-wide mode. */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); pc = pmc_pcpu[cpu]; phw = pc->pc_hwpmcs[ri]; /* * XXX do we need some kind of 'forced' disable? */ if (phw->phw_pmc == NULL) { if (request == PMC_STATE_DISABLED && (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) { phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED; PMC_MARK_ROW_STANDALONE(ri); } else if (request == PMC_STATE_FREE && (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) { phw->phw_state |= PMC_PHW_FLAG_IS_ENABLED; PMC_UNMARK_ROW_STANDALONE(ri); } /* other cases are a no-op */ } else error = EBUSY; pmc_restore_cpu_binding(&pb); } break; /* * Allocate a PMC. */ case PMC_OP_PMCALLOCATE: { uint32_t caps; u_int cpu; int n; enum pmc_mode mode; struct pmc *pmc; struct pmc_hw *phw; struct pmc_op_pmcallocate pa; struct pmc_binding pb; if ((error = copyin(arg, &pa, sizeof(pa))) != 0) break; caps = pa.pm_caps; mode = pa.pm_mode; cpu = pa.pm_cpu; if ((mode != PMC_MODE_SS && mode != PMC_MODE_SC && mode != PMC_MODE_TS && mode != PMC_MODE_TC) || (cpu != (u_int) PMC_CPU_ANY && cpu >= (u_int) mp_ncpus)) { error = EINVAL; break; } /* * Virtual PMCs should only ask for a default CPU. * System mode PMCs need to specify a non-default CPU. */ if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) || (PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) { error = EINVAL; break; } /* * Check that a disabled CPU is not being asked for. */ if (PMC_IS_SYSTEM_MODE(mode) && pmc_cpu_is_disabled(cpu)) { error = ENXIO; break; } /* * Refuse an allocation for a system-wide PMC if this * process has been jailed, or if this process lacks * super-user credentials and the sysctl tunable * 'security.bsd.unprivileged_syspmcs' is zero. */ if (PMC_IS_SYSTEM_MODE(mode)) { if (jailed(curthread->td_ucred)) { error = EPERM; break; } if (!pmc_unprivileged_syspmcs) { error = priv_check(curthread, PRIV_PMC_SYSTEM); if (error) break; } } if (error) break; /* * Look for valid values for 'pm_flags' */ if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) { error = EINVAL; break; } /* process logging options are not allowed for system PMCs */ if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags & (PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) { error = EINVAL; break; } /* * All sampling mode PMCs need to be able to interrupt the * CPU. */ if (PMC_IS_SAMPLING_MODE(mode)) caps |= PMC_CAP_INTERRUPT; /* A valid class specifier should have been passed in. */ for (n = 0; n < md->pmd_nclass; n++) if (md->pmd_classes[n].pm_class == pa.pm_class) break; if (n == md->pmd_nclass) { error = EINVAL; break; } /* The requested PMC capabilities should be feasible. */ if ((md->pmd_classes[n].pm_caps & caps) != caps) { error = EOPNOTSUPP; break; } PMCDBG(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d", pa.pm_ev, caps, mode, cpu); pmc = pmc_allocate_pmc_descriptor(); pmc->pm_id = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class, PMC_ID_INVALID); pmc->pm_event = pa.pm_ev; pmc->pm_state = PMC_STATE_FREE; pmc->pm_caps = caps; pmc->pm_flags = pa.pm_flags; /* switch thread to CPU 'cpu' */ pmc_save_cpu_binding(&pb); #define PMC_IS_SHAREABLE_PMC(cpu, n) \ (pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state & \ PMC_PHW_FLAG_IS_SHAREABLE) #define PMC_IS_UNALLOCATED(cpu, n) \ (pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL) if (PMC_IS_SYSTEM_MODE(mode)) { pmc_select_cpu(cpu); for (n = 0; n < (int) md->pmd_npmc; n++) if (pmc_can_allocate_row(n, mode) == 0 && pmc_can_allocate_rowindex( curthread->td_proc, n, cpu) == 0 && (PMC_IS_UNALLOCATED(cpu, n) || PMC_IS_SHAREABLE_PMC(cpu, n)) && md->pmd_allocate_pmc(cpu, n, pmc, &pa) == 0) break; } else { /* Process virtual mode */ for (n = 0; n < (int) md->pmd_npmc; n++) { if (pmc_can_allocate_row(n, mode) == 0 && pmc_can_allocate_rowindex( curthread->td_proc, n, PMC_CPU_ANY) == 0 && md->pmd_allocate_pmc(curthread->td_oncpu, n, pmc, &pa) == 0) break; } } #undef PMC_IS_UNALLOCATED #undef PMC_IS_SHAREABLE_PMC pmc_restore_cpu_binding(&pb); if (n == (int) md->pmd_npmc) { pmc_destroy_pmc_descriptor(pmc); FREE(pmc, M_PMC); pmc = NULL; error = EINVAL; break; } /* Fill in the correct value in the ID field */ pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n); PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x", pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id); /* Process mode PMCs with logging enabled need log files */ if (pmc->pm_flags & (PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW)) pmc->pm_flags |= PMC_F_NEEDS_LOGFILE; /* All system mode sampling PMCs require a log file */ if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode)) pmc->pm_flags |= PMC_F_NEEDS_LOGFILE; /* * Configure global pmc's immediately */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) { pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); phw = pmc_pcpu[cpu]->pc_hwpmcs[n]; if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 || (error = md->pmd_config_pmc(cpu, n, pmc)) != 0) { (void) md->pmd_release_pmc(cpu, n, pmc); pmc_destroy_pmc_descriptor(pmc); FREE(pmc, M_PMC); pmc = NULL; pmc_restore_cpu_binding(&pb); error = EPERM; break; } pmc_restore_cpu_binding(&pb); } pmc->pm_state = PMC_STATE_ALLOCATED; /* * mark row disposition */ if (PMC_IS_SYSTEM_MODE(mode)) PMC_MARK_ROW_STANDALONE(n); else PMC_MARK_ROW_THREAD(n); /* * Register this PMC with the current thread as its owner. */ if ((error = pmc_register_owner(curthread->td_proc, pmc)) != 0) { pmc_release_pmc_descriptor(pmc); FREE(pmc, M_PMC); pmc = NULL; break; } /* * Return the allocated index. */ pa.pm_pmcid = pmc->pm_id; error = copyout(&pa, arg, sizeof(pa)); } break; /* * Attach a PMC to a process. */ case PMC_OP_PMCATTACH: { struct pmc *pm; struct proc *p; struct pmc_op_pmcattach a; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &a, sizeof(a))) != 0) break; if (a.pm_pid < 0) { error = EINVAL; break; } else if (a.pm_pid == 0) a.pm_pid = td->td_proc->p_pid; if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0) break; if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { error = EINVAL; break; } /* PMCs may be (re)attached only when allocated or stopped */ if (pm->pm_state == PMC_STATE_RUNNING) { error = EBUSY; break; } else if (pm->pm_state != PMC_STATE_ALLOCATED && pm->pm_state != PMC_STATE_STOPPED) { error = EINVAL; break; } /* lookup pid */ if ((p = pfind(a.pm_pid)) == NULL) { error = ESRCH; break; } /* * Ignore processes that are working on exiting. */ if (p->p_flag & P_WEXIT) { error = ESRCH; PROC_UNLOCK(p); /* pfind() returns a locked process */ break; } /* * we are allowed to attach a PMC to a process if * we can debug it. */ error = p_candebug(curthread, p); PROC_UNLOCK(p); if (error == 0) error = pmc_attach_process(p, pm); } break; /* * Detach an attached PMC from a process. */ case PMC_OP_PMCDETACH: { struct pmc *pm; struct proc *p; struct pmc_op_pmcattach a; if ((error = copyin(arg, &a, sizeof(a))) != 0) break; if (a.pm_pid < 0) { error = EINVAL; break; } else if (a.pm_pid == 0) a.pm_pid = td->td_proc->p_pid; if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0) break; if ((p = pfind(a.pm_pid)) == NULL) { error = ESRCH; break; } /* * Treat processes that are in the process of exiting * as if they were not present. */ if (p->p_flag & P_WEXIT) error = ESRCH; PROC_UNLOCK(p); /* pfind() returns a locked process */ if (error == 0) error = pmc_detach_process(p, pm); } break; /* * Retrieve the MSR number associated with the counter * 'pmc_id'. This allows processes to directly use RDPMC * instructions to read their PMCs, without the overhead of a * system call. */ case PMC_OP_PMCGETMSR: { int ri; struct pmc *pm; struct pmc_target *pt; struct pmc_op_getmsr gm; PMC_DOWNGRADE_SX(); /* CPU has no 'GETMSR' support */ if (md->pmd_get_msr == NULL) { error = ENOSYS; break; } if ((error = copyin(arg, &gm, sizeof(gm))) != 0) break; if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0) break; /* * The allocated PMC has to be a process virtual PMC, * i.e., of type MODE_T[CS]. Global PMCs can only be * read using the PMCREAD operation since they may be * allocated on a different CPU than the one we could * be running on at the time of the RDPMC instruction. * * The GETMSR operation is not allowed for PMCs that * are inherited across processes. */ if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) || (pm->pm_flags & PMC_F_DESCENDANTS)) { error = EINVAL; break; } /* * It only makes sense to use a RDPMC (or its * equivalent instruction on non-x86 architectures) on * a process that has allocated and attached a PMC to * itself. Conversely the PMC is only allowed to have * one process attached to it -- its owner. */ if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL || LIST_NEXT(pt, pt_next) != NULL || pt->pt_process->pp_proc != pm->pm_owner->po_owner) { error = EINVAL; break; } ri = PMC_TO_ROWINDEX(pm); if ((error = (*md->pmd_get_msr)(ri, &gm.pm_msr)) < 0) break; if ((error = copyout(&gm, arg, sizeof(gm))) < 0) break; /* * Mark our process as using MSRs. Update machine * state using a forced context switch. */ pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS; pmc_force_context_switch(); } break; /* * Release an allocated PMC */ case PMC_OP_PMCRELEASE: { pmc_id_t pmcid; struct pmc *pm; struct pmc_owner *po; struct pmc_op_simple sp; /* * Find PMC pointer for the named PMC. * * Use pmc_release_pmc_descriptor() to switch off the * PMC, remove all its target threads, and remove the * PMC from its owner's list. * * Remove the owner record if this is the last PMC * owned. * * Free up space. */ if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; po = pm->pm_owner; pmc_release_pmc_descriptor(pm); pmc_maybe_remove_owner(po); FREE(pm, M_PMC); } break; /* * Read and/or write a PMC. */ case PMC_OP_PMCRW: { uint32_t cpu, ri; struct pmc *pm; struct pmc_op_pmcrw *pprw; struct pmc_op_pmcrw prw; struct pmc_binding pb; pmc_value_t oldvalue; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &prw, sizeof(prw))) != 0) break; ri = 0; PMCDBG(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid, prw.pm_flags); /* must have at least one flag set */ if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) { error = EINVAL; break; } /* locate pmc descriptor */ if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0) break; /* Can't read a PMC that hasn't been started. */ if (pm->pm_state != PMC_STATE_ALLOCATED && pm->pm_state != PMC_STATE_STOPPED && pm->pm_state != PMC_STATE_RUNNING) { error = EINVAL; break; } /* writing a new value is allowed only for 'STOPPED' pmcs */ if (pm->pm_state == PMC_STATE_RUNNING && (prw.pm_flags & PMC_F_NEWVALUE)) { error = EBUSY; break; } if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) { /* * If this PMC is attached to its owner (i.e., * the process requesting this operation) and * is running, then attempt to get an * upto-date reading from hardware for a READ. * Writes are only allowed when the PMC is * stopped, so only update the saved value * field. * * If the PMC is not running, or is not * attached to its owner, read/write to the * savedvalue field. */ ri = PMC_TO_ROWINDEX(pm); mtx_pool_lock_spin(pmc_mtxpool, pm); cpu = curthread->td_oncpu; if (prw.pm_flags & PMC_F_OLDVALUE) { if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) && (pm->pm_state == PMC_STATE_RUNNING)) error = (*md->pmd_read_pmc)(cpu, ri, &oldvalue); else oldvalue = pm->pm_gv.pm_savedvalue; } if (prw.pm_flags & PMC_F_NEWVALUE) pm->pm_gv.pm_savedvalue = prw.pm_value; mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { /* System mode PMCs */ cpu = PMC_TO_CPU(pm); ri = PMC_TO_ROWINDEX(pm); if (pmc_cpu_is_disabled(cpu)) { error = ENXIO; break; } /* move this thread to CPU 'cpu' */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); critical_enter(); /* save old value */ if (prw.pm_flags & PMC_F_OLDVALUE) if ((error = (*md->pmd_read_pmc)(cpu, ri, &oldvalue))) goto error; /* write out new value */ if (prw.pm_flags & PMC_F_NEWVALUE) error = (*md->pmd_write_pmc)(cpu, ri, prw.pm_value); error: critical_exit(); pmc_restore_cpu_binding(&pb); if (error) break; } pprw = (struct pmc_op_pmcrw *) arg; #ifdef DEBUG if (prw.pm_flags & PMC_F_NEWVALUE) PMCDBG(PMC,OPS,2, "rw id=%d new %jx -> old %jx", ri, prw.pm_value, oldvalue); else if (prw.pm_flags & PMC_F_OLDVALUE) PMCDBG(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue); #endif /* return old value if requested */ if (prw.pm_flags & PMC_F_OLDVALUE) if ((error = copyout(&oldvalue, &pprw->pm_value, sizeof(prw.pm_value)))) break; } break; /* * Set the sampling rate for a sampling mode PMC and the * initial count for a counting mode PMC. */ case PMC_OP_PMCSETCOUNT: { struct pmc *pm; struct pmc_op_pmcsetcount sc; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &sc, sizeof(sc))) != 0) break; if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0) break; if (pm->pm_state == PMC_STATE_RUNNING) { error = EBUSY; break; } if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pm->pm_sc.pm_reloadcount = sc.pm_count; else pm->pm_sc.pm_initial = sc.pm_count; } break; /* * Start a PMC. */ case PMC_OP_PMCSTART: { pmc_id_t pmcid; struct pmc *pm; struct pmc_op_simple sp; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; KASSERT(pmcid == pm->pm_id, ("[pmc,%d] pmcid %x != id %x", __LINE__, pm->pm_id, pmcid)); if (pm->pm_state == PMC_STATE_RUNNING) /* already running */ break; else if (pm->pm_state != PMC_STATE_STOPPED && pm->pm_state != PMC_STATE_ALLOCATED) { error = EINVAL; break; } error = pmc_start(pm); } break; /* * Stop a PMC. */ case PMC_OP_PMCSTOP: { pmc_id_t pmcid; struct pmc *pm; struct pmc_op_simple sp; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; /* * Mark the PMC as inactive and invoke the MD stop * routines if needed. */ if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; KASSERT(pmcid == pm->pm_id, ("[pmc,%d] pmc id %x != pmcid %x", __LINE__, pm->pm_id, pmcid)); if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */ break; else if (pm->pm_state != PMC_STATE_RUNNING) { error = EINVAL; break; } error = pmc_stop(pm); } break; /* * Write a user supplied value to the log file. */ case PMC_OP_WRITELOG: { struct pmc_op_writelog wl; struct pmc_owner *po; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &wl, sizeof(wl))) != 0) break; if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) { error = EINVAL; break; } error = pmclog_process_userlog(po, &wl); } break; default: error = EINVAL; break; } if (is_sx_downgraded) sx_sunlock(&pmc_sx); else sx_xunlock(&pmc_sx); if (error) atomic_add_int(&pmc_stats.pm_syscall_errors, 1); PICKUP_GIANT(); return error; } /* * Helper functions */ /* * Mark the thread as needing callchain capture and post an AST. The * actual callchain capture will be done in a context where it is safe * to take page faults. */ static void pmc_post_callchain_ast(void) { struct thread *td; td = curthread; /* * Mark this thread as needing processing in ast(). * td->td_pflags will be safe to touch as the process was in * user space when it was interrupted. */ td->td_pflags |= TDP_CALLCHAIN; /* * Again, since we've entered this function directly from * userland, `td' is guaranteed to be not locked by this CPU, * so its safe to try acquire the thread lock even though we * are executing in an NMI context. We need to acquire this * lock before touching `td_flags' because other CPUs may be * in the process of touching this field. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); return; } /* * Interrupt processing. * * Find a free slot in the per-cpu array of samples and capture the * current callchain there. If a sample was successfully added, a bit * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook * needs to be invoked from the clock handler. * * This function is meant to be called from an NMI handler. It cannot * use any of the locking primitives supplied by the OS. */ int pmc_process_interrupt(int cpu, struct pmc *pm, struct trapframe *tf, int inuserspace) { int error, callchaindepth; struct thread *td; struct pmc_sample *ps; struct pmc_samplebuffer *psb; error = 0; /* * Allocate space for a sample buffer. */ psb = pmc_pcpu[cpu]->pc_sb; ps = psb->ps_write; if (ps->ps_nsamples) { /* in use, reader hasn't caught up */ pm->pm_stalled = 1; atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1); PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); error = ENOMEM; goto done; } /* Fill in entry. */ PMCDBG(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); atomic_add_rel_32(&pm->pm_runcount, 1); /* hold onto PMC */ ps->ps_pmc = pm; if ((td = curthread) && td->td_proc) ps->ps_pid = td->td_proc->p_pid; else ps->ps_pid = -1; ps->ps_cpu = cpu; ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0; callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ? pmc_callchaindepth : 1; if (callchaindepth == 1) ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf); else { /* * Kernel stack traversals can be done immediately, * while we defer to an AST for user space traversals. */ if (!inuserspace) callchaindepth = pmc_save_kernel_callchain(ps->ps_pc, callchaindepth, tf); else { pmc_post_callchain_ast(); callchaindepth = PMC_SAMPLE_INUSE; } } ps->ps_nsamples = callchaindepth; /* mark entry as in use */ /* increment write pointer, modulo ring buffer size */ ps++; if (ps == psb->ps_fence) psb->ps_write = psb->ps_samples; else psb->ps_write = ps; done: /* mark CPU as needing processing */ atomic_set_rel_int(&pmc_cpumask, (1 << cpu)); return (error); } /* * Capture a user call chain. This function will be called from ast() * before control returns to userland and before the process gets * rescheduled. */ static void pmc_capture_user_callchain(int cpu, struct trapframe *tf) { int i; struct pmc *pm; struct pmc_sample *ps; struct pmc_samplebuffer *psb; psb = pmc_pcpu[cpu]->pc_sb; /* * Iterate through all deferred callchain requests. */ for (i = 0; i < pmc_nsamples; i++) { ps = &psb->ps_samples[i]; if (ps->ps_nsamples != PMC_SAMPLE_INUSE) continue; pm = ps->ps_pmc; KASSERT(pm->pm_flags & PMC_F_CALLCHAIN, ("[pmc,%d] Retrieving callchain for PMC that doesn't " "want it", __LINE__)); /* * Retrieve the callchain and mark the sample buffer * as 'processable' by the timer tick sweep code. */ ps->ps_nsamples = pmc_save_user_callchain(ps->ps_pc, pmc_callchaindepth, tf); } return; } /* * Process saved PC samples. */ static void pmc_process_samples(int cpu) { int n, ri; struct pmc *pm; struct thread *td; struct pmc_owner *po; struct pmc_sample *ps; struct pmc_samplebuffer *psb; KASSERT(PCPU_GET(cpuid) == cpu, ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__, PCPU_GET(cpuid), cpu)); psb = pmc_pcpu[cpu]->pc_sb; for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */ ps = psb->ps_read; if (ps->ps_nsamples == PMC_SAMPLE_FREE) break; if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { /* Need a rescan at a later time. */ atomic_set_rel_int(&pmc_cpumask, (1 << cpu)); break; } pm = ps->ps_pmc; po = pm->pm_owner; KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__, pm, PMC_TO_MODE(pm))); /* Ignore PMCs that have been switched off */ if (pm->pm_state != PMC_STATE_RUNNING) goto entrydone; PMCDBG(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu, pm, ps->ps_nsamples, ps->ps_flags, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); /* * If this is a process-mode PMC that is attached to * its owner, and if the PC is in user mode, update * profiling statistics like timer-based profiling * would have done. */ if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) { if (ps->ps_flags & PMC_CC_F_USERSPACE) { td = FIRST_THREAD_IN_PROC(po->po_owner); addupc_intr(td, ps->ps_pc[0], 1); } goto entrydone; } /* * Otherwise, this is either a sampling mode PMC that * is attached to a different process than its owner, * or a system-wide sampling PMC. Dispatch a log * entry to the PMC's owner process. */ pmclog_process_callchain(pm, ps); entrydone: ps->ps_nsamples = 0; /* mark entry as free */ atomic_subtract_rel_32(&pm->pm_runcount, 1); /* increment read pointer, modulo sample size */ if (++ps == psb->ps_fence) psb->ps_read = psb->ps_samples; else psb->ps_read = ps; } atomic_add_int(&pmc_stats.pm_log_sweeps, 1); /* Do not re-enable stalled PMCs if we failed to process any samples */ if (n == 0) return; /* * Restart any stalled sampling PMCs on this CPU. * * If the NMI handler sets the pm_stalled field of a PMC after * the check below, we'll end up processing the stalled PMC at * the next hardclock tick. */ for (n = 0; n < md->pmd_npmc; n++) { (void) (*md->pmd_get_config)(cpu,n,&pm); if (pm == NULL || /* !cfg'ed */ pm->pm_state != PMC_STATE_RUNNING || /* !active */ !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */ pm->pm_stalled == 0) /* !stalled */ continue; pm->pm_stalled = 0; ri = PMC_TO_ROWINDEX(pm); (*md->pmd_start_pmc)(cpu, ri); } } /* * Event handlers. */ /* * Handle a process exit. * * Remove this process from all hash tables. If this process * owned any PMCs, turn off those PMCs and deallocate them, * removing any associations with target processes. * * This function will be called by the last 'thread' of a * process. * * XXX This eventhandler gets called early in the exit process. * Consider using a 'hook' invocation from thread_exit() or equivalent * spot. Another negative is that kse_exit doesn't seem to call * exit1() [??]. * */ static void pmc_process_exit(void *arg __unused, struct proc *p) { int is_using_hwpmcs; int cpu; unsigned int ri; struct pmc *pm; struct pmc_process *pp; struct pmc_owner *po; pmc_value_t newvalue, tmp; PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; PROC_UNLOCK(p); /* * Log a sysexit event to all SS PMC owners. */ LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_sysexit(po, p->p_pid); if (!is_using_hwpmcs) return; PMC_GET_SX_XLOCK(); PMCDBG(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid, p->p_comm); /* * Since this code is invoked by the last thread in an exiting * process, we would have context switched IN at some prior * point. However, with PREEMPTION, kernel mode context * switches may happen any time, so we want to disable a * context switch OUT till we get any PMCs targetting this * process off the hardware. * * We also need to atomically remove this process' * entry from our target process hash table, using * PMC_FLAG_REMOVE. */ PMCDBG(PRC,EXT,1, "process-exit proc=%p (%d, %s)", p, p->p_pid, p->p_comm); critical_enter(); /* no preemption */ cpu = curthread->td_oncpu; if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_REMOVE)) != NULL) { PMCDBG(PRC,EXT,2, "process-exit proc=%p pmc-process=%p", p, pp); /* * The exiting process could the target of * some PMCs which will be running on * currently executing CPU. * * We need to turn these PMCs off like we * would do at context switch OUT time. */ for (ri = 0; ri < md->pmd_npmc; ri++) { /* * Pick up the pmc pointer from hardware * state similar to the CSW_OUT code. */ pm = NULL; (void) (*md->pmd_get_config)(cpu, ri, &pm); PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm); if (pm == NULL || !PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) continue; PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p " "state=%d", ri, pp->pp_pmcs[ri].pp_pmc, pm, pm->pm_state); KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] ri mismatch pmc(%d) ri(%d)", __LINE__, PMC_TO_ROWINDEX(pm), ri)); KASSERT(pm == pp->pp_pmcs[ri].pp_pmc, ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc)); (void) md->pmd_stop_pmc(cpu, ri); KASSERT(pm->pm_runcount > 0, ("[pmc,%d] bad runcount ri %d rc %d", __LINE__, ri, pm->pm_runcount)); /* Stop hardware only if it is actually running */ if (pm->pm_state == PMC_STATE_RUNNING && pm->pm_stalled == 0) { md->pmd_read_pmc(cpu, ri, &newvalue); tmp = newvalue - PMC_PCPU_SAVED(cpu,ri); mtx_pool_lock_spin(pmc_mtxpool, pm); pm->pm_gv.pm_savedvalue += tmp; pp->pp_pmcs[ri].pp_pmcval += tmp; mtx_pool_unlock_spin(pmc_mtxpool, pm); } atomic_subtract_rel_32(&pm->pm_runcount,1); KASSERT((int) pm->pm_runcount >= 0, ("[pmc,%d] runcount is %d", __LINE__, ri)); (void) md->pmd_config_pmc(cpu, ri, NULL); } /* * Inform the MD layer of this pseudo "context switch * out" */ (void) md->pmd_switch_out(pmc_pcpu[cpu], pp); critical_exit(); /* ok to be pre-empted now */ /* * Unlink this process from the PMCs that are * targetting it. This will send a signal to * all PMC owner's whose PMCs are orphaned. * * Log PMC value at exit time if requested. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) { if (pm->pm_flags & PMC_F_NEEDS_LOGFILE && PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm))) pmclog_process_procexit(pm, pp); pmc_unlink_target_process(pm, pp); } FREE(pp, M_PMC); } else critical_exit(); /* pp == NULL */ /* * If the process owned PMCs, free them up and free up * memory. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } sx_xunlock(&pmc_sx); } /* * Handle a process fork. * * If the parent process 'p1' is under HWPMC monitoring, then copy * over any attached PMCs that have 'do_descendants' semantics. */ static void pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc, int flags) { int is_using_hwpmcs; unsigned int ri; uint32_t do_descendants; struct pmc *pm; struct pmc_owner *po; struct pmc_process *ppnew, *ppold; (void) flags; /* unused parameter */ PROC_LOCK(p1); is_using_hwpmcs = p1->p_flag & P_HWPMC; PROC_UNLOCK(p1); /* * If there are system-wide sampling PMCs active, we need to * log all fork events to their owner's logs. */ LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); if (!is_using_hwpmcs) return; PMC_GET_SX_XLOCK(); PMCDBG(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1, p1->p_pid, p1->p_comm, newproc); /* * If the parent process (curthread->td_proc) is a * target of any PMCs, look for PMCs that are to be * inherited, and link these into the new process * descriptor. */ if ((ppold = pmc_find_process_descriptor(curthread->td_proc, PMC_FLAG_NONE)) == NULL) goto done; /* nothing to do */ do_descendants = 0; for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL) do_descendants |= pm->pm_flags & PMC_F_DESCENDANTS; if (do_descendants == 0) /* nothing to do */ goto done; /* allocate a descriptor for the new process */ if ((ppnew = pmc_find_process_descriptor(newproc, PMC_FLAG_ALLOCATE)) == NULL) goto done; /* * Run through all PMCs that were targeting the old process * and which specified F_DESCENDANTS and attach them to the * new process. * * Log the fork event to all owners of PMCs attached to this * process, if not already logged. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL && (pm->pm_flags & PMC_F_DESCENDANTS)) { pmc_link_target_process(pm, ppnew); po = pm->pm_owner; if (po->po_sscount == 0 && po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); } /* * Now mark the new process as being tracked by this driver. */ PROC_LOCK(newproc); newproc->p_flag |= P_HWPMC; PROC_UNLOCK(newproc); done: sx_xunlock(&pmc_sx); } /* * initialization */ static const char *pmc_name_of_pmcclass[] = { #undef __PMC_CLASS #define __PMC_CLASS(N) #N , __PMC_CLASSES() }; static int pmc_initialize(void) { int cpu, error, n; struct pmc_binding pb; struct pmc_sample *ps; struct pmc_samplebuffer *sb; md = NULL; error = 0; #ifdef DEBUG /* parse debug flags first */ if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr, sizeof(pmc_debugstr))) pmc_debugflags_parse(pmc_debugstr, pmc_debugstr+strlen(pmc_debugstr)); #endif PMCDBG(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION); /* check kernel version */ if (pmc_kernel_version != PMC_VERSION) { if (pmc_kernel_version == 0) printf("hwpmc: this kernel has not been compiled with " "'options HWPMC_HOOKS'.\n"); else printf("hwpmc: kernel version (0x%x) does not match " "module version (0x%x).\n", pmc_kernel_version, PMC_VERSION); return EPROGMISMATCH; } /* * check sysctl parameters */ if (pmc_hashsize <= 0) { (void) printf("hwpmc: tunable \"hashsize\"=%d must be " "greater than zero.\n", pmc_hashsize); pmc_hashsize = PMC_HASH_SIZE; } if (pmc_nsamples <= 0 || pmc_nsamples > 65535) { (void) printf("hwpmc: tunable \"nsamples\"=%d out of " "range.\n", pmc_nsamples); pmc_nsamples = PMC_NSAMPLES; } if (pmc_callchaindepth <= 0 || pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) { (void) printf("hwpmc: tunable \"callchaindepth\"=%d out of " "range.\n", pmc_callchaindepth); pmc_callchaindepth = PMC_CALLCHAIN_DEPTH; } md = pmc_md_initialize(); if (md == NULL || md->pmd_init == NULL) return ENOSYS; /* allocate space for the per-cpu array */ MALLOC(pmc_pcpu, struct pmc_cpu **, mp_ncpus * sizeof(struct pmc_cpu *), M_PMC, M_WAITOK|M_ZERO); /* per-cpu 'saved values' for managing process-mode PMCs */ MALLOC(pmc_pcpu_saved, pmc_value_t *, sizeof(pmc_value_t) * mp_ncpus * md->pmd_npmc, M_PMC, M_WAITOK); /* perform cpu dependent initialization */ pmc_save_cpu_binding(&pb); for (cpu = 0; cpu < mp_ncpus; cpu++) { if (pmc_cpu_is_disabled(cpu)) continue; pmc_select_cpu(cpu); if ((error = md->pmd_init(cpu)) != 0) break; } pmc_restore_cpu_binding(&pb); if (error != 0) return error; /* allocate space for the sample array */ for (cpu = 0; cpu < mp_ncpus; cpu++) { if (pmc_cpu_is_disabled(cpu)) continue; MALLOC(sb, struct pmc_samplebuffer *, sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, M_WAITOK|M_ZERO); sb->ps_read = sb->ps_write = sb->ps_samples; sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); MALLOC(sb->ps_callchains, uintptr_t *, pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, M_WAITOK|M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb = sb; } /* allocate space for the row disposition array */ pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO); KASSERT(pmc_pmcdisp != NULL, ("[pmc,%d] pmcdisp allocation returned NULL", __LINE__)); /* mark all PMCs as available */ for (n = 0; n < (int) md->pmd_npmc; n++) PMC_MARK_ROW_FREE(n); /* allocate thread hash tables */ pmc_ownerhash = hashinit(pmc_hashsize, M_PMC, &pmc_ownerhashmask); pmc_processhash = hashinit(pmc_hashsize, M_PMC, &pmc_processhashmask); mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc-leaf", MTX_SPIN); LIST_INIT(&pmc_ss_owners); pmc_ss_count = 0; /* allocate a pool of spin mutexes */ pmc_mtxpool = mtx_pool_create("pmc-leaf", pmc_mtxpool_size, MTX_SPIN); PMCDBG(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx " "targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask, pmc_processhash, pmc_processhashmask); /* register process {exit,fork,exec} handlers */ pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit, pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY); pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork, pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY); /* initialize logging */ pmclog_initialize(); /* set hook functions */ pmc_intr = md->pmd_intr; pmc_hook = pmc_hook_handler; if (error == 0) { printf(PMC_MODULE_NAME ":"); for (n = 0; n < (int) md->pmd_nclass; n++) { printf(" %s/%d/0x%b", pmc_name_of_pmcclass[md->pmd_classes[n].pm_class], md->pmd_nclasspmcs[n], md->pmd_classes[n].pm_caps, "\20" "\1INT\2USR\3SYS\4EDG\5THR" "\6REA\7WRI\10INV\11QUA\12PRC" "\13TAG\14CSC"); } printf("\n"); } return error; } /* prepare to be unloaded */ static void pmc_cleanup(void) { int cpu; struct pmc_ownerhash *ph; struct pmc_owner *po, *tmp; struct pmc_binding pb; #ifdef DEBUG struct pmc_processhash *prh; #endif PMCDBG(MOD,INI,0, "%s", "cleanup"); /* switch off sampling */ atomic_store_rel_int(&pmc_cpumask, 0); pmc_intr = NULL; sx_xlock(&pmc_sx); if (pmc_hook == NULL) { /* being unloaded already */ sx_xunlock(&pmc_sx); return; } pmc_hook = NULL; /* prevent new threads from entering module */ /* deregister event handlers */ EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag); EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag); /* send SIGBUS to all owner threads, free up allocations */ if (pmc_ownerhash) for (ph = pmc_ownerhash; ph <= &pmc_ownerhash[pmc_ownerhashmask]; ph++) { LIST_FOREACH_SAFE(po, ph, po_next, tmp) { pmc_remove_owner(po); /* send SIGBUS to owner processes */ PMCDBG(MOD,INI,2, "cleanup signal proc=%p " "(%d, %s)", po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm); PROC_LOCK(po->po_owner); psignal(po->po_owner, SIGBUS); PROC_UNLOCK(po->po_owner); pmc_destroy_owner_descriptor(po); } } /* reclaim allocated data structures */ if (pmc_mtxpool) mtx_pool_destroy(&pmc_mtxpool); mtx_destroy(&pmc_processhash_mtx); if (pmc_processhash) { #ifdef DEBUG struct pmc_process *pp; PMCDBG(MOD,INI,3, "%s", "destroy process hash"); for (prh = pmc_processhash; prh <= &pmc_processhash[pmc_processhashmask]; prh++) LIST_FOREACH(pp, prh, pp_next) PMCDBG(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid); #endif hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask); pmc_processhash = NULL; } if (pmc_ownerhash) { PMCDBG(MOD,INI,3, "%s", "destroy owner hash"); hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask); pmc_ownerhash = NULL; } KASSERT(LIST_EMPTY(&pmc_ss_owners), ("[pmc,%d] Global SS owner list not empty", __LINE__)); KASSERT(pmc_ss_count == 0, ("[pmc,%d] Global SS count not empty", __LINE__)); /* free the per-cpu sample buffers */ for (cpu = 0; cpu < mp_ncpus; cpu++) { if (pmc_cpu_is_disabled(cpu)) continue; KASSERT(pmc_pcpu[cpu]->pc_sb != NULL, ("[pmc,%d] Null cpu sample buffer cpu=%d", __LINE__, cpu)); FREE(pmc_pcpu[cpu]->pc_sb->ps_callchains, M_PMC); FREE(pmc_pcpu[cpu]->pc_sb, M_PMC); pmc_pcpu[cpu]->pc_sb = NULL; } /* do processor dependent cleanup */ PMCDBG(MOD,INI,3, "%s", "md cleanup"); if (md) { pmc_save_cpu_binding(&pb); for (cpu = 0; cpu < mp_ncpus; cpu++) { PMCDBG(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p", cpu, pmc_pcpu[cpu]); if (pmc_cpu_is_disabled(cpu)) continue; pmc_select_cpu(cpu); if (pmc_pcpu[cpu]) (void) md->pmd_cleanup(cpu); } FREE(md, M_PMC); md = NULL; pmc_restore_cpu_binding(&pb); } /* deallocate per-cpu structures */ FREE(pmc_pcpu, M_PMC); pmc_pcpu = NULL; FREE(pmc_pcpu_saved, M_PMC); pmc_pcpu_saved = NULL; if (pmc_pmcdisp) { FREE(pmc_pmcdisp, M_PMC); pmc_pmcdisp = NULL; } pmclog_shutdown(); sx_xunlock(&pmc_sx); /* we are done */ } /* * The function called at load/unload. */ static int load (struct module *module __unused, int cmd, void *arg __unused) { int error; error = 0; switch (cmd) { case MOD_LOAD : /* initialize the subsystem */ error = pmc_initialize(); if (error != 0) break; PMCDBG(MOD,INI,1, "syscall=%d ncpus=%d", pmc_syscall_num, mp_ncpus); break; case MOD_UNLOAD : case MOD_SHUTDOWN: pmc_cleanup(); PMCDBG(MOD,INI,1, "%s", "unloaded"); break; default : error = EINVAL; /* XXX should panic(9) */ break; } return error; } /* memory pool */ MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module"); Index: head/sys/dev/md/md.c =================================================================== --- head/sys/dev/md/md.c (revision 175201) +++ head/sys/dev/md/md.c (revision 175202) @@ -1,1307 +1,1307 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ /*- * The following functions are based in the vn(4) driver: mdstart_swap(), * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), * and as such under the following copyright: * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: vn.c 1.13 94/04/02 * * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03 */ #include "opt_geom.h" #include "opt_md.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MD_MODVER 1 #define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */ #define MD_EXITING 0x20000 /* Worker thread is exiting. */ #ifndef MD_NSECT #define MD_NSECT (10000 * 2) #endif static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk"); static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors"); static int md_debug; SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, ""); #if defined(MD_ROOT) && defined(MD_ROOT_SIZE) /* * Preloaded image gets put here. * Applications that patch the object with the image can determine * the size looking at the start and end markers (strings), * so we want them contiguous. */ static struct { u_char start[MD_ROOT_SIZE*1024]; u_char end[128]; } mfs_root = { .start = "MFS Filesystem goes here", .end = "MFS Filesystem had better STOP here", }; #endif static g_init_t g_md_init; static g_fini_t g_md_fini; static g_start_t g_md_start; static g_access_t g_md_access; static void g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp); static int mdunits; static struct cdev *status_dev = 0; static struct sx md_sx; static d_ioctl_t mdctlioctl; static struct cdevsw mdctl_cdevsw = { .d_version = D_VERSION, .d_ioctl = mdctlioctl, .d_name = MD_NAME, }; struct g_class g_md_class = { .name = "MD", .version = G_VERSION, .init = g_md_init, .fini = g_md_fini, .start = g_md_start, .access = g_md_access, .dumpconf = g_md_dumpconf, }; DECLARE_GEOM_CLASS(g_md_class, g_md); static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list); #define NINDIR (PAGE_SIZE / sizeof(uintptr_t)) #define NMASK (NINDIR-1) static int nshift; struct indir { uintptr_t *array; u_int total; u_int used; u_int shift; }; struct md_s { int unit; LIST_ENTRY(md_s) list; struct bio_queue_head bio_queue; struct mtx queue_mtx; struct cdev *dev; enum md_types type; off_t mediasize; unsigned sectorsize; unsigned opencount; unsigned fwheads; unsigned fwsectors; unsigned flags; char name[20]; struct proc *procp; struct g_geom *gp; struct g_provider *pp; int (*start)(struct md_s *sc, struct bio *bp); struct devstat *devstat; /* MD_MALLOC related fields */ struct indir *indir; uma_zone_t uma; /* MD_PRELOAD related fields */ u_char *pl_ptr; size_t pl_len; /* MD_VNODE related fields */ struct vnode *vnode; char file[PATH_MAX]; struct ucred *cred; /* MD_SWAP related fields */ vm_object_t object; }; static struct indir * new_indir(u_int shift) { struct indir *ip; ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO); if (ip == NULL) return (NULL); ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, M_NOWAIT | M_ZERO); if (ip->array == NULL) { free(ip, M_MD); return (NULL); } ip->total = NINDIR; ip->shift = shift; return (ip); } static void del_indir(struct indir *ip) { free(ip->array, M_MDSECT); free(ip, M_MD); } static void destroy_indir(struct md_s *sc, struct indir *ip) { int i; for (i = 0; i < NINDIR; i++) { if (!ip->array[i]) continue; if (ip->shift) destroy_indir(sc, (struct indir*)(ip->array[i])); else if (ip->array[i] > 255) uma_zfree(sc->uma, (void *)(ip->array[i])); } del_indir(ip); } /* * This function does the math and allocates the top level "indir" structure * for a device of "size" sectors. */ static struct indir * dimension(off_t size) { off_t rcnt; struct indir *ip; int i, layer; rcnt = size; layer = 0; while (rcnt > NINDIR) { rcnt /= NINDIR; layer++; } /* figure out log2(NINDIR) */ for (i = NINDIR, nshift = -1; i; nshift++) i >>= 1; /* * XXX: the top layer is probably not fully populated, so we allocate * too much space for ip->array in here. */ ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO); ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, M_WAITOK | M_ZERO); ip->total = NINDIR; ip->shift = layer * nshift; return (ip); } /* * Read a given sector */ static uintptr_t s_read(struct indir *ip, off_t offset) { struct indir *cip; int idx; uintptr_t up; if (md_debug > 1) printf("s_read(%jd)\n", (intmax_t)offset); up = 0; for (cip = ip; cip != NULL;) { if (cip->shift) { idx = (offset >> cip->shift) & NMASK; up = cip->array[idx]; cip = (struct indir *)up; continue; } idx = offset & NMASK; return (cip->array[idx]); } return (0); } /* * Write a given sector, prune the tree if the value is 0 */ static int s_write(struct indir *ip, off_t offset, uintptr_t ptr) { struct indir *cip, *lip[10]; int idx, li; uintptr_t up; if (md_debug > 1) printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr); up = 0; li = 0; cip = ip; for (;;) { lip[li++] = cip; if (cip->shift) { idx = (offset >> cip->shift) & NMASK; up = cip->array[idx]; if (up != 0) { cip = (struct indir *)up; continue; } /* Allocate branch */ cip->array[idx] = (uintptr_t)new_indir(cip->shift - nshift); if (cip->array[idx] == 0) return (ENOSPC); cip->used++; up = cip->array[idx]; cip = (struct indir *)up; continue; } /* leafnode */ idx = offset & NMASK; up = cip->array[idx]; if (up != 0) cip->used--; cip->array[idx] = ptr; if (ptr != 0) cip->used++; break; } if (cip->used != 0 || li == 1) return (0); li--; while (cip->used == 0 && cip != ip) { li--; idx = (offset >> lip[li]->shift) & NMASK; up = lip[li]->array[idx]; KASSERT(up == (uintptr_t)cip, ("md screwed up")); del_indir(cip); lip[li]->array[idx] = 0; lip[li]->used--; cip = lip[li]; } return (0); } static int g_md_access(struct g_provider *pp, int r, int w, int e) { struct md_s *sc; sc = pp->geom->softc; if (sc == NULL) return (ENXIO); r += pp->acr; w += pp->acw; e += pp->ace; if ((sc->flags & MD_READONLY) != 0 && w > 0) return (EROFS); if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { sc->opencount = 1; } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { sc->opencount = 0; } return (0); } static void g_md_start(struct bio *bp) { struct md_s *sc; sc = bp->bio_to->geom->softc; if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) devstat_start_transaction_bio(sc->devstat, bp); mtx_lock(&sc->queue_mtx); bioq_disksort(&sc->bio_queue, bp); mtx_unlock(&sc->queue_mtx); wakeup(sc); } static int mdstart_malloc(struct md_s *sc, struct bio *bp) { int i, error; u_char *dst; off_t secno, nsec, uc; uintptr_t sp, osp; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: return (EOPNOTSUPP); } nsec = bp->bio_length / sc->sectorsize; secno = bp->bio_offset / sc->sectorsize; dst = bp->bio_data; error = 0; while (nsec--) { osp = s_read(sc->indir, secno); if (bp->bio_cmd == BIO_DELETE) { if (osp != 0) error = s_write(sc->indir, secno, 0); } else if (bp->bio_cmd == BIO_READ) { if (osp == 0) bzero(dst, sc->sectorsize); else if (osp <= 255) for (i = 0; i < sc->sectorsize; i++) dst[i] = osp; else bcopy((void *)osp, dst, sc->sectorsize); osp = 0; } else if (bp->bio_cmd == BIO_WRITE) { if (sc->flags & MD_COMPRESS) { uc = dst[0]; for (i = 1; i < sc->sectorsize; i++) if (dst[i] != uc) break; } else { i = 0; uc = 0; } if (i == sc->sectorsize) { if (osp != uc) error = s_write(sc->indir, secno, uc); } else { if (osp <= 255) { sp = (uintptr_t)uma_zalloc(sc->uma, M_NOWAIT); if (sp == 0) { error = ENOSPC; break; } bcopy(dst, (void *)sp, sc->sectorsize); error = s_write(sc->indir, secno, sp); } else { bcopy(dst, (void *)osp, sc->sectorsize); osp = 0; } } } else { error = EOPNOTSUPP; } if (osp > 255) uma_zfree(sc->uma, (void*)osp); if (error != 0) break; secno++; dst += sc->sectorsize; } bp->bio_resid = 0; return (error); } static int mdstart_preload(struct md_s *sc, struct bio *bp) { switch (bp->bio_cmd) { case BIO_READ: bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data, bp->bio_length); break; case BIO_WRITE: bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset, bp->bio_length); break; } bp->bio_resid = 0; return (0); } static int mdstart_vnode(struct md_s *sc, struct bio *bp) { int error, vfslocked; struct uio auio; struct iovec aiov; struct mount *mp; struct vnode *vp; struct thread *td; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_FLUSH: break; default: return (EOPNOTSUPP); } td = curthread; vp = sc->vnode; /* * VNODE I/O * * If an error occurs, we set BIO_ERROR but we do not set * B_INVAL because (for a write anyway), the buffer is * still valid. */ if (bp->bio_cmd == BIO_FLUSH) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); (void) vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } bzero(&auio, sizeof(auio)); aiov.iov_base = bp->bio_data; aiov.iov_len = bp->bio_length; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = (vm_ooffset_t)bp->bio_offset; auio.uio_segflg = UIO_SYSSPACE; if (bp->bio_cmd == BIO_READ) auio.uio_rw = UIO_READ; else if (bp->bio_cmd == BIO_WRITE) auio.uio_rw = UIO_WRITE; else panic("wrong BIO_OP in mdstart_vnode"); auio.uio_resid = bp->bio_length; auio.uio_td = td; /* * When reading set IO_DIRECT to try to avoid double-caching * the data. When writing IO_DIRECT is not optimal. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (bp->bio_cmd == BIO_READ) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred); VOP_UNLOCK(vp, 0, td); } else { (void) vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); } VFS_UNLOCK_GIANT(vfslocked); bp->bio_resid = auio.uio_resid; return (error); } static int mdstart_swap(struct md_s *sc, struct bio *bp) { struct sf_buf *sf; int rv, offs, len, lastend; vm_pindex_t i, lastp; vm_page_t m; u_char *p; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: return (EOPNOTSUPP); } p = bp->bio_data; /* * offs is the offset at which to start operating on the * next (ie, first) page. lastp is the last page on * which we're going to operate. lastend is the ending * position within that last page (ie, PAGE_SIZE if * we're operating on complete aligned pages). */ offs = bp->bio_offset % PAGE_SIZE; lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE; lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1; rv = VM_PAGER_OK; VM_OBJECT_LOCK(sc->object); vm_object_pip_add(sc->object, 1); for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; m = vm_page_grab(sc->object, i, VM_ALLOC_NORMAL|VM_ALLOC_RETRY); VM_OBJECT_UNLOCK(sc->object); sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); VM_OBJECT_LOCK(sc->object); if (bp->bio_cmd == BIO_READ) { if (m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { sf_buf_free(sf); sched_unpin(); vm_page_lock_queues(); vm_page_wakeup(m); vm_page_unlock_queues(); break; } bcopy((void *)(sf_buf_kva(sf) + offs), p, len); } else if (bp->bio_cmd == BIO_WRITE) { if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { sf_buf_free(sf); sched_unpin(); vm_page_lock_queues(); vm_page_wakeup(m); vm_page_unlock_queues(); break; } bcopy(p, (void *)(sf_buf_kva(sf) + offs), len); m->valid = VM_PAGE_BITS_ALL; #if 0 } else if (bp->bio_cmd == BIO_DELETE) { if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { sf_buf_free(sf); sched_unpin(); vm_page_lock_queues(); vm_page_wakeup(m); vm_page_unlock_queues(); break; } bzero((void *)(sf_buf_kva(sf) + offs), len); vm_page_dirty(m); m->valid = VM_PAGE_BITS_ALL; #endif } sf_buf_free(sf); sched_unpin(); vm_page_lock_queues(); vm_page_wakeup(m); vm_page_activate(m); if (bp->bio_cmd == BIO_WRITE) vm_page_dirty(m); vm_page_unlock_queues(); /* Actions on further pages start at offset 0 */ p += PAGE_SIZE - offs; offs = 0; #if 0 if (bootverbose || bp->bio_offset / PAGE_SIZE < 17) printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n", m->wire_count, m->busy, m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i); #endif } vm_object_pip_subtract(sc->object, 1); vm_object_set_writeable_dirty(sc->object); VM_OBJECT_UNLOCK(sc->object); return (rv != VM_PAGER_ERROR ? 0 : ENOSPC); } static void md_kthread(void *arg) { struct md_s *sc; struct bio *bp; int error; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); if (sc->type == MD_VNODE) curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { mtx_lock(&sc->queue_mtx); if (sc->flags & MD_SHUTDOWN) { sc->flags |= MD_EXITING; mtx_unlock(&sc->queue_mtx); kproc_exit(0); } bp = bioq_takefirst(&sc->bio_queue); if (!bp) { msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0); continue; } mtx_unlock(&sc->queue_mtx); if (bp->bio_cmd == BIO_GETATTR) { if (sc->fwsectors && sc->fwheads && (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) || g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads))) error = -1; else error = EOPNOTSUPP; } else { error = sc->start(sc, bp); } if (error != -1) { bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) devstat_end_transaction_bio(sc->devstat, bp); } } } static struct md_s * mdfind(int unit) { struct md_s *sc; LIST_FOREACH(sc, &md_softc_list, list) { if (sc->unit == unit) break; } return (sc); } static struct md_s * mdnew(int unit, int *errp, enum md_types type) { struct md_s *sc, *sc2; int error, max = -1; *errp = 0; LIST_FOREACH(sc2, &md_softc_list, list) { if (unit == sc2->unit) { *errp = EBUSY; return (NULL); } if (unit == -1 && sc2->unit > max) max = sc2->unit; } if (unit == -1) unit = max + 1; sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO); sc->type = type; bioq_init(&sc->bio_queue); mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF); sc->unit = unit; sprintf(sc->name, "md%d", unit); LIST_INSERT_HEAD(&md_softc_list, sc, list); error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name); if (error == 0) return (sc); LIST_REMOVE(sc, list); mtx_destroy(&sc->queue_mtx); free(sc, M_MD); *errp = error; return (NULL); } static void mdinit(struct md_s *sc) { struct g_geom *gp; struct g_provider *pp; g_topology_lock(); gp = g_new_geomf(&g_md_class, "md%d", sc->unit); gp->softc = sc; pp = g_new_providerf(gp, "md%d", sc->unit); pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; sc->gp = gp; sc->pp = pp; g_error_provider(pp, 0); g_topology_unlock(); sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); } /* * XXX: we should check that the range they feed us is mapped. * XXX: we should implement read-only. */ static int mdcreate_preload(struct md_s *sc, struct md_ioctl *mdio) { if (mdio->md_options & ~(MD_AUTOUNIT | MD_FORCE)) return (EINVAL); sc->flags = mdio->md_options & MD_FORCE; /* Cast to pointer size, then to pointer to avoid warning */ sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base; sc->pl_len = (size_t)sc->mediasize; return (0); } static int mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio) { uintptr_t sp; int error; off_t u; error = 0; if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE)) return (EINVAL); if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize)) return (EINVAL); /* Compression doesn't make sense if we have reserved space */ if (mdio->md_options & MD_RESERVE) mdio->md_options &= ~MD_COMPRESS; if (mdio->md_fwsectors != 0) sc->fwsectors = mdio->md_fwsectors; if (mdio->md_fwheads != 0) sc->fwheads = mdio->md_fwheads; sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE); sc->indir = dimension(sc->mediasize / sc->sectorsize); sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL, 0x1ff, 0); if (mdio->md_options & MD_RESERVE) { off_t nsectors; nsectors = sc->mediasize / sc->sectorsize; for (u = 0; u < nsectors; u++) { sp = (uintptr_t)uma_zalloc(sc->uma, M_NOWAIT | M_ZERO); if (sp != 0) error = s_write(sc->indir, u, sp); else error = ENOMEM; if (error != 0) break; } } return (error); } static int mdsetcred(struct md_s *sc, struct ucred *cred) { char *tmpbuf; int error = 0; /* * Set credits in our softc */ if (sc->cred) crfree(sc->cred); sc->cred = crhold(cred); /* * Horrible kludge to establish credentials for NFS XXX. */ if (sc->vnode) { struct uio auio; struct iovec aiov; tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK); bzero(&auio, sizeof(auio)); aiov.iov_base = tmpbuf; aiov.iov_len = sc->sectorsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = aiov.iov_len; - vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(sc->vnode, &auio, 0, sc->cred); VOP_UNLOCK(sc->vnode, 0, curthread); free(tmpbuf, M_TEMP); } return (error); } static int mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) { struct vattr vattr; struct nameidata nd; int error, flags, vfslocked; error = copyinstr(mdio->md_file, sc->file, sizeof(sc->file), NULL); if (error != 0) return (error); flags = FREAD|FWRITE; /* * If the user specified that this is a read only device, unset the * FWRITE mask before trying to open the backing store. */ if ((mdio->md_options & MD_READONLY) != 0) flags &= ~FWRITE; NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, sc->file, td); error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG || (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) { VOP_UNLOCK(nd.ni_vp, 0, td); (void)vn_close(nd.ni_vp, flags, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (error ? error : EINVAL); } nd.ni_vp->v_vflag |= VV_MD; VOP_UNLOCK(nd.ni_vp, 0, td); if (mdio->md_fwsectors != 0) sc->fwsectors = mdio->md_fwsectors; if (mdio->md_fwheads != 0) sc->fwheads = mdio->md_fwheads; sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC); if (!(flags & FWRITE)) sc->flags |= MD_READONLY; sc->vnode = nd.ni_vp; error = mdsetcred(sc, td->td_ucred); if (error != 0) { - vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); nd.ni_vp->v_vflag &= ~VV_MD; VOP_UNLOCK(nd.ni_vp, 0, td); (void)vn_close(nd.ni_vp, flags, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } VFS_UNLOCK_GIANT(vfslocked); return (0); } static int mddestroy(struct md_s *sc, struct thread *td) { int vfslocked; if (sc->gp) { sc->gp->softc = NULL; g_topology_lock(); g_wither_geom(sc->gp, ENXIO); g_topology_unlock(); sc->gp = NULL; sc->pp = NULL; } if (sc->devstat) { devstat_remove_entry(sc->devstat); sc->devstat = NULL; } mtx_lock(&sc->queue_mtx); sc->flags |= MD_SHUTDOWN; wakeup(sc); while (!(sc->flags & MD_EXITING)) msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10); mtx_unlock(&sc->queue_mtx); mtx_destroy(&sc->queue_mtx); if (sc->vnode != NULL) { vfslocked = VFS_LOCK_GIANT(sc->vnode->v_mount); - vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); sc->vnode->v_vflag &= ~VV_MD; VOP_UNLOCK(sc->vnode, 0, td); (void)vn_close(sc->vnode, sc->flags & MD_READONLY ? FREAD : (FREAD|FWRITE), sc->cred, td); VFS_UNLOCK_GIANT(vfslocked); } if (sc->cred != NULL) crfree(sc->cred); if (sc->object != NULL) vm_object_deallocate(sc->object); if (sc->indir) destroy_indir(sc, sc->indir); if (sc->uma) uma_zdestroy(sc->uma); LIST_REMOVE(sc, list); free(sc, M_MD); return (0); } static int mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) { vm_ooffset_t npage; int error; /* * Range check. Disallow negative sizes or any size less then the * size of a page. Then round to a page. */ if (sc->mediasize == 0 || (sc->mediasize % PAGE_SIZE) != 0) return (EDOM); /* * Allocate an OBJT_SWAP object. * * Note the truncation. */ npage = mdio->md_mediasize / PAGE_SIZE; if (mdio->md_fwsectors != 0) sc->fwsectors = mdio->md_fwsectors; if (mdio->md_fwheads != 0) sc->fwheads = mdio->md_fwheads; sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage, VM_PROT_DEFAULT, 0); if (sc->object == NULL) return (ENOMEM); sc->flags = mdio->md_options & MD_FORCE; if (mdio->md_options & MD_RESERVE) { if (swap_pager_reserve(sc->object, 0, npage) < 0) { vm_object_deallocate(sc->object); sc->object = NULL; return (EDOM); } } error = mdsetcred(sc, td->td_ucred); if (error != 0) { vm_object_deallocate(sc->object); sc->object = NULL; } return (error); } static int xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct md_ioctl *mdio; struct md_s *sc; int error, i; if (md_debug) printf("mdctlioctl(%s %lx %p %x %p)\n", devtoname(dev), cmd, addr, flags, td); mdio = (struct md_ioctl *)addr; if (mdio->md_version != MDIOVERSION) return (EINVAL); /* * We assert the version number in the individual ioctl * handlers instead of out here because (a) it is possible we * may add another ioctl in the future which doesn't read an * mdio, and (b) the correct return value for an unknown ioctl * is ENOIOCTL, not EINVAL. */ error = 0; switch (cmd) { case MDIOCATTACH: switch (mdio->md_type) { case MD_MALLOC: case MD_PRELOAD: case MD_VNODE: case MD_SWAP: break; default: return (EINVAL); } if (mdio->md_options & MD_AUTOUNIT) sc = mdnew(-1, &error, mdio->md_type); else sc = mdnew(mdio->md_unit, &error, mdio->md_type); if (sc == NULL) return (error); if (mdio->md_options & MD_AUTOUNIT) mdio->md_unit = sc->unit; sc->mediasize = mdio->md_mediasize; if (mdio->md_sectorsize == 0) sc->sectorsize = DEV_BSIZE; else sc->sectorsize = mdio->md_sectorsize; error = EDOOFUS; switch (sc->type) { case MD_MALLOC: sc->start = mdstart_malloc; error = mdcreate_malloc(sc, mdio); break; case MD_PRELOAD: sc->start = mdstart_preload; error = mdcreate_preload(sc, mdio); break; case MD_VNODE: sc->start = mdstart_vnode; error = mdcreate_vnode(sc, mdio, td); break; case MD_SWAP: sc->start = mdstart_swap; error = mdcreate_swap(sc, mdio, td); break; } if (error != 0) { mddestroy(sc, td); return (error); } /* Prune off any residual fractional sector */ i = sc->mediasize % sc->sectorsize; sc->mediasize -= i; mdinit(sc); return (0); case MDIOCDETACH: if (mdio->md_mediasize != 0 || mdio->md_options != 0) return (EINVAL); sc = mdfind(mdio->md_unit); if (sc == NULL) return (ENOENT); if (sc->opencount != 0 && !(sc->flags & MD_FORCE)) return (EBUSY); return (mddestroy(sc, td)); case MDIOCQUERY: sc = mdfind(mdio->md_unit); if (sc == NULL) return (ENOENT); mdio->md_type = sc->type; mdio->md_options = sc->flags; mdio->md_mediasize = sc->mediasize; mdio->md_sectorsize = sc->sectorsize; if (sc->type == MD_VNODE) error = copyout(sc->file, mdio->md_file, strlen(sc->file) + 1); return (error); case MDIOCLIST: i = 1; LIST_FOREACH(sc, &md_softc_list, list) { if (i == MDNPAD - 1) mdio->md_pad[i] = -1; else mdio->md_pad[i++] = sc->unit; } mdio->md_pad[0] = i - 1; return (0); default: return (ENOIOCTL); }; } static int mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { int error; sx_xlock(&md_sx); error = xmdctlioctl(dev, cmd, addr, flags, td); sx_xunlock(&md_sx); return (error); } static void md_preloaded(u_char *image, size_t length) { struct md_s *sc; int error; sc = mdnew(-1, &error, MD_PRELOAD); if (sc == NULL) return; sc->mediasize = length; sc->sectorsize = DEV_BSIZE; sc->pl_ptr = image; sc->pl_len = length; sc->start = mdstart_preload; #ifdef MD_ROOT if (sc->unit == 0) rootdevnames[0] = "ufs:/dev/md0"; #endif mdinit(sc); } static void g_md_init(struct g_class *mp __unused) { caddr_t mod; caddr_t c; u_char *ptr, *name, *type; unsigned len; mod = NULL; sx_init(&md_sx, "MD config lock"); g_topology_unlock(); #ifdef MD_ROOT_SIZE sx_xlock(&md_sx); md_preloaded(mfs_root.start, sizeof(mfs_root.start)); sx_xunlock(&md_sx); #endif /* XXX: are preload_* static or do they need Giant ? */ while ((mod = preload_search_next_name(mod)) != NULL) { name = (char *)preload_search_info(mod, MODINFO_NAME); if (name == NULL) continue; type = (char *)preload_search_info(mod, MODINFO_TYPE); if (type == NULL) continue; if (strcmp(type, "md_image") && strcmp(type, "mfs_root")) continue; c = preload_search_info(mod, MODINFO_ADDR); ptr = *(u_char **)c; c = preload_search_info(mod, MODINFO_SIZE); len = *(size_t *)c; printf("%s%d: Preloaded image <%s> %d bytes at %p\n", MD_NAME, mdunits, name, len, ptr); sx_xlock(&md_sx); md_preloaded(ptr, len); sx_xunlock(&md_sx); } status_dev = make_dev(&mdctl_cdevsw, MAXMINOR, UID_ROOT, GID_WHEEL, 0600, MDCTL_NAME); g_topology_lock(); } static void g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct md_s *mp; char *type; mp = gp->softc; if (mp == NULL) return; switch (mp->type) { case MD_MALLOC: type = "malloc"; break; case MD_PRELOAD: type = "preload"; break; case MD_VNODE: type = "vnode"; break; case MD_SWAP: type = "swap"; break; default: type = "unknown"; break; } if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " u %d", mp->unit); sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize); sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads); sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors); sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize); sbuf_printf(sb, " t %s", type); if (mp->type == MD_VNODE && mp->vnode != NULL) sbuf_printf(sb, " file %s", mp->file); } else { sbuf_printf(sb, "%s%d\n", indent, mp->unit); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->sectorsize); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->fwheads); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->fwsectors); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->mediasize); sbuf_printf(sb, "%s%s\n", indent, type); if (mp->type == MD_VNODE && mp->vnode != NULL) sbuf_printf(sb, "%s%s\n", indent, mp->file); } } } static void g_md_fini(struct g_class *mp __unused) { sx_destroy(&md_sx); if (status_dev != NULL) destroy_dev(status_dev); } Index: head/sys/fs/cd9660/cd9660_lookup.c =================================================================== --- head/sys/fs/cd9660/cd9660_lookup.c (revision 175201) +++ head/sys/fs/cd9660/cd9660_lookup.c (revision 175202) @@ -1,430 +1,430 @@ /*- * Copyright (c) 1989, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_lookup.c 7.33 (Berkeley) 5/19/91 * @(#)cd9660_lookup.c 8.2 (Berkeley) 1/23/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the filesystem is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and iput * instead of two iputs. * * Overall outline of ufs_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache * * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked. */ int cd9660_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *vdp; /* vnode for directory being searched */ struct iso_node *dp; /* inode for directory being searched */ struct iso_mnt *imp; /* filesystem that directory is in */ struct buf *bp; /* a buffer of directory entries */ struct iso_directory_record *ep = 0;/* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ int saveoffset = 0; /* offset of last directory entry in dir */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by cd9660_vget_internal */ u_long bmask; /* block offset mask */ int error; ino_t ino = 0, saved_ino; int reclen; u_short namelen; int isoflags; char altname[NAME_MAX]; int res; int assoc, len; char *name; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; struct thread *td = cnp->cn_thread; bp = NULL; *vpp = NULL; vdp = ap->a_dvp; dp = VTOI(vdp); imp = dp->i_mnt; /* * We now have a segment name to search for, and a directory to search. */ len = cnp->cn_namelen; name = cnp->cn_nameptr; /* * A leading `=' means, we are looking for an associated file */ if ((assoc = (imp->iso_ftype != ISO_FTYPE_RRIP && *name == ASSOCCHAR))) { len--; name++; } /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ bmask = imp->im_bmask; if (nameiop != LOOKUP || dp->i_diroff == 0 || dp->i_diroff > dp->i_size) { entryoffsetinblock = 0; dp->i_offset = 0; numdirpasses = 1; } else { dp->i_offset = dp->i_diroff; if ((entryoffsetinblock = dp->i_offset & bmask) && (error = cd9660_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } endsearch = dp->i_size; searchloop: while (dp->i_offset < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((dp->i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp)) != 0) return (error); entryoffsetinblock = 0; } /* * Get pointer to next entry. */ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ dp->i_offset = (dp->i_offset & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) /* illegal entry, stop */ break; if (entryoffsetinblock + reclen > imp->logical_block_size) /* entries are not allowed to cross boundaries */ break; namelen = isonum_711(ep->name_len); isoflags = isonum_711(imp->iso_ftype == ISO_FTYPE_HIGH_SIERRA? &ep->date[6]: ep->flags); if (reclen < ISO_DIRECTORY_RECORD_SIZE + namelen) /* illegal entry, stop */ break; /* * Check for a name match. */ switch (imp->iso_ftype) { default: if (!(isoflags & 4) == !assoc) { if ((len == 1 && *name == '.') || (flags & ISDOTDOT)) { if (namelen == 1 && ep->name[0] == ((flags & ISDOTDOT) ? 1 : 0)) { /* * Save directory entry's inode number and * release directory buffer. */ dp->i_ino = isodirino(ep, imp); goto found; } if (namelen != 1 || ep->name[0] != 0) goto notfound; } else if (!(res = isofncmp(name, len, ep->name, namelen, imp->joliet_level, imp->im_flags, imp->im_d2l, imp->im_l2d))) { if (isoflags & 2) ino = isodirino(ep, imp); else ino = dbtob(bp->b_blkno) + entryoffsetinblock; saveoffset = dp->i_offset; } else if (ino) goto foundino; #ifdef NOSORTBUG /* On some CDs directory entries are not sorted correctly */ else if (res < 0) goto notfound; else if (res > 0 && numdirpasses == 2) numdirpasses++; #endif } break; case ISO_FTYPE_RRIP: if (isonum_711(ep->flags)&2) ino = isodirino(ep, imp); else ino = dbtob(bp->b_blkno) + entryoffsetinblock; dp->i_ino = ino; cd9660_rrip_getname(ep,altname,&namelen,&dp->i_ino,imp); if (namelen == cnp->cn_namelen && !bcmp(name,altname,namelen)) goto found; ino = 0; break; } dp->i_offset += reclen; entryoffsetinblock += reclen; } if (ino) { foundino: dp->i_ino = ino; if (saveoffset != dp->i_offset) { if (lblkno(imp, dp->i_offset) != lblkno(imp, saveoffset)) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)saveoffset, NULL, &bp)) != 0) return (error); } entryoffsetinblock = saveoffset & bmask; ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); dp->i_offset = saveoffset; } goto found; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; dp->i_offset = 0; endsearch = dp->i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * Insert name into cache (as non-existent) if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); if (nameiop == CREATE || nameiop == RENAME) return (EROFS); return (ENOENT); found: if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = dp->i_offset; /* * Step through the translation in the name. We do not `iput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the `iget' for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; /* * If ino is different from dp->i_ino, * it's a relocated directory. */ if (flags & ISDOTDOT) { saved_ino = dp->i_ino; VOP_UNLOCK(pdp, 0, td); /* race to get the inode */ error = cd9660_vget_internal(vdp->v_mount, saved_ino, LK_EXCLUSIVE, &tdp, saved_ino != ino, ep); brelse(bp); - vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); *vpp = tdp; } else if (dp->i_number == dp->i_ino) { brelse(bp); VREF(vdp); /* we want ourself, ie "." */ *vpp = vdp; } else { error = cd9660_vget_internal(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp, dp->i_ino != ino, ep); brelse(bp); if (error) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ int cd9660_blkatoff(vp, offset, res, bpp) struct vnode *vp; off_t offset; char **res; struct buf **bpp; { struct iso_node *ip; struct iso_mnt *imp; struct buf *bp; daddr_t lbn; int bsize, bshift, error; ip = VTOI(vp); imp = ip->i_mnt; lbn = lblkno(imp, offset); bsize = blksize(imp, ip, lbn); bshift = imp->im_bshift; if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) { brelse(bp); *bpp = NULL; return (error); } /* * We must BMAP the buffer because the directory code may use b_blkno * to calculate the inode for certain types of directory entries. * We could get away with not doing it before we VMIO-backed the * directories because the buffers would get freed atomically with * the invalidation of their data. But with VMIO-backed buffers * the buffers may be freed and then later reconstituted - and the * reconstituted buffer will have no knowledge of b_blkno. */ if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (bshift - DEV_BSHIFT); } if (res) *res = (char *)bp->b_data + blkoff(imp, offset); *bpp = bp; return (0); } Index: head/sys/fs/cd9660/cd9660_vfsops.c =================================================================== --- head/sys/fs/cd9660/cd9660_vfsops.c (revision 175201) +++ head/sys/fs/cd9660/cd9660_vfsops.c (revision 175202) @@ -1,823 +1,823 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure"); MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part"); struct iconv_functions *cd9660_iconv = NULL; static vfs_mount_t cd9660_mount; static vfs_cmount_t cd9660_cmount; static vfs_unmount_t cd9660_unmount; static vfs_root_t cd9660_root; static vfs_statfs_t cd9660_statfs; static vfs_vget_t cd9660_vget; static vfs_fhtovp_t cd9660_fhtovp; static struct vfsops cd9660_vfsops = { .vfs_fhtovp = cd9660_fhtovp, .vfs_mount = cd9660_mount, .vfs_cmount = cd9660_cmount, .vfs_root = cd9660_root, .vfs_statfs = cd9660_statfs, .vfs_unmount = cd9660_unmount, .vfs_vget = cd9660_vget, }; VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY); MODULE_VERSION(cd9660, 1); static int iso_mountfs(struct vnode *devvp, struct mount *mp, struct thread *td); /* * VFS Operations. */ static int cd9660_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct iso_args args; int error; error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64); ma = mount_argsu(ma, "cs_local", args.cs_local, 64); ma = mount_argf(ma, "ssector", "%u", args.ssector); ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip"); ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens"); ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt"); ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet"); ma = mount_argb(ma, args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet"); ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv"); error = kernel_mount(ma, flags); return (error); } static int cd9660_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; char *fspec; int error; mode_t accessmode; struct nameidata ndp; struct iso_mnt *imp = 0; /* * Unconditionally mount as read-only. */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); imp = VFSTOISOFS(mp); if (mp->mnt_flag & MNT_UPDATE) { if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) return (0); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(&ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp))) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vrele(devvp); return (error); } /* * Verify that user has necessary permissions on the device, * or has superuser abilities */ accessmode = VREAD; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, td); if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = iso_mountfs(devvp, mp, td); } else { if (devvp != imp->im_devvp) error = EINVAL; /* needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return error; } vfs_mountedfrom(mp, fspec); return 0; } /* * Common code for mount and mountroot */ static int iso_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct iso_mnt *isomp = (struct iso_mnt *)0; struct buf *bp = NULL; struct buf *pribp = NULL, *supbp = NULL; struct cdev *dev = devvp->v_rdev; int error = EINVAL; int high_sierra = 0; int iso_bsize; int iso_blknum; int joliet_level; struct iso_volume_descriptor *vdp = 0; struct iso_primary_descriptor *pri = NULL; struct iso_sierra_primary_descriptor *pri_sierra = NULL; struct iso_supplementary_descriptor *sup = NULL; struct iso_directory_record *rootp; int logical_block_size, ssector; struct g_consumer *cp; struct bufobj *bo; char *cs_local, *cs_disk; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "cd9660", 0); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return error; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. */ if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); return (EINVAL); } iso_bsize = cp->provider->sectorsize; joliet_level = 0; if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector)) ssector = 0; for (iso_blknum = 16 + ssector; iso_blknum < 100 + ssector; iso_blknum++) { if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE), iso_bsize, NOCRED, &bp)) != 0) goto out; vdp = (struct iso_volume_descriptor *)bp->b_data; if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) { if (bcmp (vdp->id_sierra, ISO_SIERRA_ID, sizeof vdp->id) != 0) { error = EINVAL; goto out; } else high_sierra = 1; } switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){ case ISO_VD_PRIMARY: if (pribp == NULL) { pribp = bp; bp = NULL; pri = (struct iso_primary_descriptor *)vdp; pri_sierra = (struct iso_sierra_primary_descriptor *)vdp; } break; case ISO_VD_SUPPLEMENTARY: if (supbp == NULL) { supbp = bp; bp = NULL; sup = (struct iso_supplementary_descriptor *)vdp; if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) { if (bcmp(sup->escape, "%/@", 3) == 0) joliet_level = 1; if (bcmp(sup->escape, "%/C", 3) == 0) joliet_level = 2; if (bcmp(sup->escape, "%/E", 3) == 0) joliet_level = 3; if ((isonum_711 (sup->flags) & 1) && !vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0)) joliet_level = 0; } } break; case ISO_VD_END: goto vd_end; default: break; } if (bp) { brelse(bp); bp = NULL; } } vd_end: if (bp) { brelse(bp); bp = NULL; } if (pri == NULL) { error = EINVAL; goto out; } logical_block_size = isonum_723 (high_sierra? pri_sierra->logical_block_size: pri->logical_block_size); if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE || (logical_block_size & (logical_block_size - 1)) != 0) { error = EINVAL; goto out; } rootp = (struct iso_directory_record *) (high_sierra? pri_sierra->root_directory_record: pri->root_directory_record); isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO); isomp->im_cp = cp; isomp->im_bo = bo; isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (high_sierra? pri_sierra->volume_space_size: pri->volume_space_size); isomp->joliet_level = 0; /* * Since an ISO9660 multi-session CD can also access previous * sessions, we have to include them into the space consider- * ations. This doesn't yield a very accurate number since * parts of the old sessions might be inaccessible now, but we * can't do much better. This is also important for the NFS * filehandle validation. */ isomp->volume_space_size += ssector; bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = ffs(logical_block_size) - 1; pribp->b_flags |= B_AGE; brelse(pribp); pribp = NULL; mp->mnt_data = isomp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = 0; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP); vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS); vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT); vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET); vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV); /* Check the Rock Ridge Extension support */ if (!(isomp->im_flags & ISOFSMNT_NORRIP)) { if ((error = bread(isomp->im_devvp, (isomp->root_extent + isonum_711(rootp->ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT), isomp->logical_block_size, NOCRED, &bp)) != 0) goto out; rootp = (struct iso_directory_record *)bp->b_data; if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { isomp->im_flags |= ISOFSMNT_NORRIP; } else { isomp->im_flags &= ~ISOFSMNT_GENS; } /* * The contents are valid, * but they will get reread as part of another vnode, so... */ bp->b_flags |= B_AGE; brelse(bp); bp = NULL; } if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) { cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error); if (error) goto out; cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error); if (error) goto out; cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l); cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d); } else { isomp->im_d2l = NULL; isomp->im_l2d = NULL; } if (high_sierra) { /* this effectively ignores all the mount flags */ if (bootverbose) log(LOG_INFO, "cd9660: High Sierra Format\n"); isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA; } else switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) { default: isomp->iso_ftype = ISO_FTYPE_DEFAULT; break; case ISOFSMNT_GENS|ISOFSMNT_NORRIP: isomp->iso_ftype = ISO_FTYPE_9660; break; case 0: if (bootverbose) log(LOG_INFO, "cd9660: RockRidge Extension\n"); isomp->iso_ftype = ISO_FTYPE_RRIP; break; } /* Decide whether to use the Joliet descriptor */ if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) { if (bootverbose) log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n", joliet_level); rootp = (struct iso_directory_record *) sup->root_directory_record; bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->joliet_level = joliet_level; supbp->b_flags |= B_AGE; } if (supbp) { brelse(supbp); supbp = NULL; } return 0; out: if (bp) brelse(bp); if (pribp) brelse(pribp); if (supbp) brelse(supbp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (isomp) { free((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = NULL; } return error; } /* * unmount system call */ static int cd9660_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct iso_mnt *isomp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; #if 0 mntflushbuf(mp, 0); if (mntinvalbuf(mp)) return EBUSY; #endif if ((error = vflush(mp, 0, flags, td))) return (error); isomp = VFSTOISOFS(mp); if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) { if (isomp->im_d2l) cd9660_iconv->close(isomp->im_d2l); if (isomp->im_l2d) cd9660_iconv->close(isomp->im_l2d); } DROP_GIANT(); g_topology_lock(); g_vfs_close(isomp->im_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(isomp->im_devvp); free((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Return root of a filesystem */ static int cd9660_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; ino_t ino = isodirino(dp, imp); /* * With RRIP we must use the `.' entry of the root directory. * Simply tell vget, that it's a relocated directory. */ return (cd9660_vget_internal(mp, ino, LK_EXCLUSIVE, vpp, imp->iso_ftype == ISO_FTYPE_RRIP, dp)); } /* * Get filesystem statistics. */ static int cd9660_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); sbp->f_bsize = isomp->logical_block_size; sbp->f_iosize = sbp->f_bsize; /* XXX */ sbp->f_blocks = isomp->volume_space_size; sbp->f_bfree = 0; /* total free blocks */ sbp->f_bavail = 0; /* blocks free for non superuser */ sbp->f_files = 0; /* total files */ sbp->f_ffree = 0; /* free file nodes */ return 0; } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is in range * - call iget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the generation number matches */ /* ARGSUSED */ static int cd9660_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct ifid *ifhp = (struct ifid *)fhp; struct iso_node *ip; struct vnode *nvp; int error; #ifdef ISOFS_DBG printf("fhtovp: ino %d, start %ld\n", ifhp->ifid_ino, ifhp->ifid_start); #endif if ((error = VFS_VGET(mp, ifhp->ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, ip->i_size, curthread); return (0); } static int cd9660_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { /* * XXXX * It would be nice if we didn't always set the `relocated' flag * and force the extra read, but I don't want to think about fixing * that right now. */ return (cd9660_vget_internal(mp, ino, flags, vpp, #if 0 VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP, #else 0, #endif (struct iso_directory_record *)0)); } int cd9660_vget_internal(mp, ino, flags, vpp, relocated, isodir) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int relocated; struct iso_directory_record *isodir; { struct iso_mnt *imp; struct iso_node *ip; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; struct thread *td; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); imp = VFSTOISOFS(mp); dev = imp->im_dev; /* Allocate a new vnode/iso_node. */ if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) { *vpp = NULLVP; return (error); } MALLOC(ip, struct iso_node *, sizeof(struct iso_node), M_ISOFSNODE, M_WAITOK | M_ZERO); vp->v_data = ip; ip->i_vnode = vp; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td); error = insmntque(vp, mp); if (error != 0) { free(ip, M_ISOFSNODE); *vpp = NULLVP; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); if (isodir == 0) { int lbn, off; lbn = lblkno(imp, ino); if (lbn >= imp->volume_space_size) { vput(vp); printf("fhtovp: lbn exceed volume space %d\n", lbn); return (ESTALE); } off = blkoff(imp, ino); if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { vput(vp); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); return (ESTALE); } error = bread(imp->im_devvp, lbn << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { vput(vp); brelse(bp); printf("fhtovp: bread error %d\n",error); return (error); } isodir = (struct iso_directory_record *)(bp->b_data + off); if (off + isonum_711(isodir->length) > imp->logical_block_size) { vput(vp); if (bp != 0) brelse(bp); printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); return (ESTALE); } #if 0 if (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) { if (bp != 0) brelse(bp); printf("fhtovp: file start miss %d vs %d\n", isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length), ifhp->ifid_start); return (ESTALE); } #endif } else bp = 0; ip->i_mnt = imp; VREF(imp->im_devvp); if (relocated) { /* * On relocated directories we must * read the `.' entry out of a dir. */ ip->iso_start = ino >> imp->im_bshift; if (bp != 0) brelse(bp); if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) { vput(vp); return (error); } isodir = (struct iso_directory_record *)bp->b_data; } ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; /* * Setup time stamp, attribute */ vp->v_type = VNON; switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { struct buf *bp2; int off; if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); else bp2 = NULL; cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660); cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660); if (bp2) brelse(bp2); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } if (bp != 0) brelse(bp); /* * Initialize the associated vnode */ switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { case VFIFO: vp->v_op = &cd9660_fifoops; break; default: break; } if (ip->iso_extent == imp->root_extent) vp->v_vflag |= VV_ROOT; /* * XXX need generation number? */ *vpp = vp; return (0); } Index: head/sys/fs/coda/coda_vfsops.c =================================================================== --- head/sys/fs/coda/coda_vfsops.c (revision 175201) +++ head/sys/fs/coda/coda_vfsops.c (revision 175202) @@ -1,521 +1,521 @@ /*- * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ */ /*- * Mach Operating System * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda filesystem at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_CODA, "coda", "Various Coda Structures"); int codadebug = 0; int coda_vfsop_print_entry = 0; #define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__func__)) struct vnode *coda_ctlvp; /* structure to keep statistics of internally generated/satisfied calls */ struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE]; #define MARK_ENTRY(op) (coda_vfsopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++) extern int coda_nc_initialized; /* Set if cache has been initialized */ extern int vc_nb_open(struct cdev *, int, int, struct thread *); int coda_vfsopstats_init(void) { register int i; for (i=0;imnt_optnew, coda_opts)) return (EINVAL); from = vfs_getopts(vfsp->mnt_optnew, "from", &error); if (error) return (error); coda_vfsopstats_init(); coda_vnodeopstats_init(); MARK_ENTRY(CODA_MOUNT_STATS); if (CODA_MOUNTED(vfsp)) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(EBUSY); } /* Validate mount device. Similar to getmdev(). */ NDINIT(&ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, from, td); error = namei(&ndp); dvp = ndp.ni_vp; if (error) { MARK_INT_FAIL(CODA_MOUNT_STATS); return (error); } if (dvp->v_type != VCHR) { MARK_INT_FAIL(CODA_MOUNT_STATS); vrele(dvp); NDFREE(&ndp, NDF_ONLY_PNBUF); return(ENXIO); } dev = dvp->v_rdev; vrele(dvp); NDFREE(&ndp, NDF_ONLY_PNBUF); /* * Initialize the mount record and link it to the vfs struct */ mi = dev2coda_mntinfo(dev); if (!mi) { MARK_INT_FAIL(CODA_MOUNT_STATS); printf("Coda mount: %s is not a cfs device\n", from); return(ENXIO); } if (!VC_OPEN(&mi->mi_vcomm)) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENODEV); } /* No initialization (here) of mi_vcomm! */ vfsp->mnt_data = mi; vfs_getnewfsid (vfsp); mi->mi_vfsp = vfsp; mi->mi_started = 0; /* XXX See coda_root() */ /* * Make a root vnode to placate the Vnode interface, but don't * actually make the CODA_ROOT call to venus until the first call * to coda_root in case a server is down while venus is starting. */ cp = make_coda_node(&rootfid, vfsp, VDIR); rootvp = CTOV(cp); rootvp->v_vflag |= VV_ROOT; cp = make_coda_node(&ctlfid, vfsp, VREG); coda_ctlvp = CTOV(cp); /* Add vfs and rootvp to chain of vfs hanging off mntinfo */ mi->mi_vfsp = vfsp; mi->mi_rootvp = rootvp; vfs_mountedfrom(vfsp, from); /* error is currently guaranteed to be zero, but in case some code changes... */ CODADEBUG(1, myprintf(("coda_omount returned %d\n",error));); if (error) MARK_INT_FAIL(CODA_MOUNT_STATS); else MARK_INT_SAT(CODA_MOUNT_STATS); return(error); } int coda_unmount(vfsp, mntflags, td) struct mount *vfsp; int mntflags; struct thread *td; { struct coda_mntinfo *mi = vftomi(vfsp); int active, error = 0; ENTRY; MARK_ENTRY(CODA_UMOUNT_STATS); if (!CODA_MOUNTED(vfsp)) { MARK_INT_FAIL(CODA_UMOUNT_STATS); return(EINVAL); } if (mi->mi_vfsp == vfsp) { /* We found the victim */ if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp))) return (EBUSY); /* Venus is still running */ #ifdef DEBUG printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp)); #endif vrele(mi->mi_rootvp); vrele(coda_ctlvp); active = coda_kill(vfsp, NOT_DOWNCALL); ASSERT_VOP_LOCKED(mi->mi_rootvp, "coda_unmount"); mi->mi_rootvp->v_vflag &= ~VV_ROOT; error = vflush(mi->mi_vfsp, 0, FORCECLOSE, td); #ifdef CODA_VERBOSE printf("coda_unmount: active = %d, vflush active %d\n", active, error); #endif error = 0; /* I'm going to take this out to allow lookups to go through. I'm * not sure it's important anyway. -- DCS 2/2/94 */ /* vfsp->VFS_DATA = NULL; */ /* No more vfsp's to hold onto */ mi->mi_vfsp = NULL; mi->mi_rootvp = NULL; if (error) MARK_INT_FAIL(CODA_UMOUNT_STATS); else MARK_INT_SAT(CODA_UMOUNT_STATS); return(error); } return (EINVAL); } /* * find root of cfs */ int coda_root(vfsp, flags, vpp, td) struct mount *vfsp; int flags; struct vnode **vpp; struct thread *td; { struct coda_mntinfo *mi = vftomi(vfsp); struct vnode **result; int error; struct proc *p = td->td_proc; CodaFid VFid; static const CodaFid invalfid = INVAL_FID; ENTRY; MARK_ENTRY(CODA_ROOT_STATS); result = NULL; if (vfsp == mi->mi_vfsp) { /* * Cache the root across calls. We only need to pass the request * on to Venus if the root vnode is the dummy we installed in * coda_omount() with all c_fid members zeroed. * * XXX In addition, we assume that the first call to coda_root() * is from vfs_omount() * (before the call to checkdirs()) and return the dummy root * node to avoid a deadlock. This bug is fixed in the Coda CVS * repository but not in any released versions as of 6 Mar 2003. */ if (memcmp(&VTOC(mi->mi_rootvp)->c_fid, &invalfid, sizeof(CodaFid)) != 0 || mi->mi_started == 0) { /* Found valid root. */ *vpp = mi->mi_rootvp; mi->mi_started = 1; /* On Mach, this is vref. On NetBSD, VOP_LOCK */ #if 1 vref(*vpp); - vn_lock(*vpp, LK_EXCLUSIVE, td); + vn_lock(*vpp, LK_EXCLUSIVE); #else vget(*vpp, LK_EXCLUSIVE, td); #endif MARK_INT_SAT(CODA_ROOT_STATS); return(0); } } error = venus_root(vftomi(vfsp), td->td_ucred, p, &VFid); if (!error) { /* * Save the new rootfid in the cnode, and rehash the cnode into the * cnode hash with the new fid key. */ coda_unsave(VTOC(mi->mi_rootvp)); VTOC(mi->mi_rootvp)->c_fid = VFid; coda_save(VTOC(mi->mi_rootvp)); *vpp = mi->mi_rootvp; #if 1 vref(*vpp); - vn_lock(*vpp, LK_EXCLUSIVE, td); + vn_lock(*vpp, LK_EXCLUSIVE); #else vget(*vpp, LK_EXCLUSIVE, td); #endif MARK_INT_SAT(CODA_ROOT_STATS); goto exit; } else if (error == ENODEV || error == EINTR) { /* Gross hack here! */ /* * If Venus fails to respond to the CODA_ROOT call, coda_call returns * ENODEV. Return the uninitialized root vnode to allow vfs * operations such as unmount to continue. Without this hack, * there is no way to do an unmount if Venus dies before a * successful CODA_ROOT call is done. All vnode operations * will fail. */ *vpp = mi->mi_rootvp; #if 1 vref(*vpp); - vn_lock(*vpp, LK_EXCLUSIVE, td); + vn_lock(*vpp, LK_EXCLUSIVE); #else vget(*vpp, LK_EXCLUSIVE, td); #endif MARK_INT_FAIL(CODA_ROOT_STATS); error = 0; goto exit; } else { CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); ); MARK_INT_FAIL(CODA_ROOT_STATS); goto exit; } exit: return(error); } /* * Get filesystem statistics. */ int coda_nb_statfs(vfsp, sbp, td) register struct mount *vfsp; struct statfs *sbp; struct thread *td; { ENTRY; /* MARK_ENTRY(CODA_STATFS_STATS); */ if (!CODA_MOUNTED(vfsp)) { /* MARK_INT_FAIL(CODA_STATFS_STATS);*/ return(EINVAL); } bzero(sbp, sizeof(struct statfs)); /* XXX - what to do about f_flags, others? --bnoble */ /* Below This is what AFS does #define NB_SFS_SIZ 0x895440 */ /* Note: Normal fs's have a bsize of 0x400 == 1024 */ sbp->f_type = vfsp->mnt_vfc->vfc_typenum; sbp->f_bsize = 8192; /* XXX */ sbp->f_iosize = 8192; /* XXX */ #define NB_SFS_SIZ 0x8AB75D sbp->f_blocks = NB_SFS_SIZ; sbp->f_bfree = NB_SFS_SIZ; sbp->f_bavail = NB_SFS_SIZ; sbp->f_files = NB_SFS_SIZ; sbp->f_ffree = NB_SFS_SIZ; bcopy((caddr_t)&(vfsp->mnt_stat.f_fsid), (caddr_t)&(sbp->f_fsid), sizeof (fsid_t)); snprintf(sbp->f_mntonname, sizeof(sbp->f_mntonname), "/coda"); snprintf(sbp->f_fstypename, sizeof(sbp->f_fstypename), "coda"); /* MARK_INT_SAT(CODA_STATFS_STATS); */ return(0); } /* * Flush any pending I/O. */ int coda_sync(vfsp, waitfor, td) struct mount *vfsp; int waitfor; struct thread *td; { ENTRY; MARK_ENTRY(CODA_SYNC_STATS); MARK_INT_SAT(CODA_SYNC_STATS); return(0); } /* * fhtovp is now what vget used to be in 4.3-derived systems. For * some silly reason, vget is now keyed by a 32 bit ino_t, rather than * a type-specific fid. */ int coda_fhtovp(vfsp, fhp, nam, vpp, exflagsp, creadanonp) register struct mount *vfsp; struct fid *fhp; struct mbuf *nam; struct vnode **vpp; int *exflagsp; struct ucred **creadanonp; { struct cfid *cfid = (struct cfid *)fhp; struct cnode *cp = 0; int error; struct thread *td = curthread; /* XXX -mach */ struct proc *p = td->td_proc; CodaFid VFid; int vtype; ENTRY; MARK_ENTRY(CODA_VGET_STATS); /* Check for vget of control object. */ if (IS_CTL_FID(&cfid->cfid_fid)) { *vpp = coda_ctlvp; vref(coda_ctlvp); MARK_INT_SAT(CODA_VGET_STATS); return(0); } error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, td->td_ucred, p, &VFid, &vtype); if (error) { CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));) *vpp = (struct vnode *)0; } else { CODADEBUG(CODA_VGET, myprintf(("vget: %s type %d result %d\n", coda_f2s(&VFid), vtype, error)); ) cp = make_coda_node(&VFid, vfsp, vtype); *vpp = CTOV(cp); } return(error); } /* * To allow for greater ease of use, some vnodes may be orphaned when * Venus dies. Certain operations should still be allowed to go * through, but without propagating ophan-ness. So this function will * get a new vnode for the file from the current run of Venus. */ int getNewVnode(vpp) struct vnode **vpp; { struct cfid cfid; struct coda_mntinfo *mi = vftomi((*vpp)->v_mount); ENTRY; cfid.cfid_len = (short)sizeof(CodaFid); cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */ /* XXX ? */ /* We're guessing that if set, the 1st element on the list is a * valid vnode to use. If not, return ENODEV as venus is dead. */ if (mi->mi_vfsp == NULL) return ENODEV; return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp, NULL, NULL); } #include #include #include /* get the mount structure corresponding to a given device. Assume * device corresponds to a UFS. Return NULL if no device is found. */ struct mount *devtomp(dev) struct cdev *dev; { struct mount *mp; TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (((VFSTOUFS(mp))->um_dev == dev)) { /* mount corresponds to UFS and the device matches one we want */ return(mp); } } /* mount structure wasn't found */ return(NULL); } struct vfsops coda_vfsops = { .vfs_mount = coda_mount, .vfs_root = coda_root, .vfs_statfs = coda_nb_statfs, .vfs_sync = coda_sync, .vfs_unmount = coda_unmount, }; VFS_SET(coda_vfsops, coda, VFCF_NETWORK); Index: head/sys/fs/coda/coda_vnops.c =================================================================== --- head/sys/fs/coda/coda_vnops.c (revision 175201) +++ head/sys/fs/coda/coda_vnops.c (revision 175202) @@ -1,1743 +1,1743 @@ /*- * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda filesystem at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * These flags select various performance enhancements. */ int coda_attr_cache = 1; /* Set to cache attributes in the kernel */ int coda_symlink_cache = 1; /* Set to cache symbolic link information */ int coda_access_cache = 1; /* Set to handle some access checks directly */ /* structure to keep track of vfs calls */ struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE]; #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++) /* What we are delaying for in printf */ int coda_printf_delay = 0; /* in microseconds */ int coda_vnop_print_entry = 0; static int coda_lockdebug = 0; /* * Some NetBSD details: * * coda_start is called at the end of the mount syscall. * coda_init is called at boot time. */ #define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__)) /* Definition of the vnode operation vector */ struct vop_vector coda_vnodeops = { .vop_default = VOP_PANIC, .vop_lookup = coda_lookup, /* lookup */ .vop_create = coda_create, /* create */ .vop_mknod = VOP_PANIC, /* mknod */ .vop_open = coda_open, /* open */ .vop_close = coda_close, /* close */ .vop_access = coda_access, /* access */ .vop_getattr = coda_getattr, /* getattr */ .vop_setattr = coda_setattr, /* setattr */ .vop_read = coda_read, /* read */ .vop_write = coda_write, /* write */ .vop_ioctl = coda_ioctl, /* ioctl */ .vop_fsync = coda_fsync, /* fsync */ .vop_remove = coda_remove, /* remove */ .vop_link = coda_link, /* link */ .vop_rename = coda_rename, /* rename */ .vop_mkdir = coda_mkdir, /* mkdir */ .vop_rmdir = coda_rmdir, /* rmdir */ .vop_symlink = coda_symlink, /* symlink */ .vop_readdir = coda_readdir, /* readdir */ .vop_readlink = coda_readlink, /* readlink */ .vop_inactive = coda_inactive, /* inactive */ .vop_reclaim = coda_reclaim, /* reclaim */ .vop_lock1 = coda_lock, /* lock */ .vop_unlock = coda_unlock, /* unlock */ .vop_bmap = coda_bmap, /* bmap */ .vop_print = VOP_PANIC, /* print */ .vop_islocked = coda_islocked, /* islocked */ .vop_pathconf = coda_pathconf, /* pathconf */ .vop_advlock = VOP_NULL, /* advlock */ .vop_lease = VOP_NULL, /* lease */ .vop_poll = vop_stdpoll, .vop_getpages = vop_stdgetpages, /* pager intf.*/ .vop_putpages = vop_stdputpages, /* pager intf.*/ .vop_getwritemount = vop_stdgetwritemount, #if 0 missing .vop_cachedlookup = ufs_lookup, .vop_whiteout = ufs_whiteout, #endif }; /* A generic do-nothing. For lease_check, advlock */ int coda_vop_nop(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("Vnode operation %s called, but unsupported\n", (*desc)->vdesc_name)); } return (0); } int coda_vnodeopstats_init(void) { register int i; for(i=0;ia_vp); struct cnode *cp = VTOC(*vpp); int flag = ap->a_mode & (~O_EXCL); struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; struct vnode *vp; MARK_ENTRY(CODA_OPEN_STATS); /* Check for open of control file. */ if (IS_CTL_VP(*vpp)) { /* XXX */ /* if (WRITEABLE(flag)) */ if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) { MARK_INT_FAIL(CODA_OPEN_STATS); return(EACCES); } MARK_INT_SAT(CODA_OPEN_STATS); return(0); } error = venus_open(vtomi((*vpp)), &cp->c_fid, flag, cred, td->td_proc, &vp); if (error) return (error); CODADEBUG( CODA_OPEN,myprintf(("open: vp %p result %d\n", vp, error));) /* Keep a reference until the close comes in. */ vref(*vpp); /* Save the vnode pointer for the cache file. */ if (cp->c_ovp == NULL) { cp->c_ovp = vp; } else { if (cp->c_ovp != vp) panic("coda_open: cp->c_ovp != ITOV(ip)"); } cp->c_ocount++; /* Flush the attribute cached if writing the file. */ if (flag & FWRITE) { cp->c_owrite++; cp->c_flags &= ~C_VATTR; } /* Open the cache file. */ error = VOP_OPEN(vp, flag, cred, td, NULL); if (error) { printf("coda_open: VOP_OPEN on container failed %d\n", error); return (error); } /* grab (above) does this when it calls newvnode unless it's in the cache*/ return(error); } /* * Close the cache file used for I/O and notify Venus. */ int coda_close(struct vop_close_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_CLOSE_STATS); /* Check for close of control file. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_CLOSE_STATS); return(0); } if (cp->c_ovp) { VOP_CLOSE(cp->c_ovp, flag, cred, td); /* Do errors matter here? */ vrele(cp->c_ovp); } #ifdef CODA_VERBOSE else printf("coda_close: NO container vp %p/cp %p\n", vp, cp); #endif if (--cp->c_ocount == 0) cp->c_ovp = NULL; if (flag & FWRITE) /* file was opened for write */ --cp->c_owrite; if (!IS_UNMOUNTING(cp)) error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, td->td_proc); else error = ENODEV; vrele(vp); CODADEBUG(CODA_CLOSE, myprintf(("close: result %d\n",error)); ) return(error); } int coda_read(struct vop_read_args *ap) { ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td)); } int coda_write(struct vop_write_args *ap) { ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td)); } int coda_rdwr(struct vnode *vp, struct uio *uiop, enum uio_rw rw, int ioflag, struct ucred *cred, struct thread *td) { /* upcall decl */ /* NOTE: container file operation!!! */ /* locals */ struct cnode *cp = VTOC(vp); struct vnode *cfvp = cp->c_ovp; int opened_internally = 0; int error = 0; MARK_ENTRY(CODA_RDWR_STATS); CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %d, %lld, %d)\n", rw, (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for rdwr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_RDWR_STATS); return(EINVAL); } /* * If file is not already open this must be a page {read,write} request * and we should open it internally. */ if (cfvp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td, NULL); printf("coda_rdwr: Internally Opening %p\n", vp); if (error) { printf("coda_rdwr: VOP_OPEN on container failed %d\n", error); return (error); } cfvp = cp->c_ovp; } /* Have UFS handle the call. */ CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), CTOV(cp)->v_usecount)); ) if (rw == UIO_READ) { error = VOP_READ(cfvp, uiop, ioflag, cred); } else { error = VOP_WRITE(cfvp, uiop, ioflag, cred); /* ufs_write updates the vnode_pager_setsize for the vnode/object */ { struct vattr attr; if (VOP_GETATTR(cfvp, &attr, cred, td) == 0) { vnode_pager_setsize(vp, attr.va_size); } } } if (error) MARK_INT_FAIL(CODA_RDWR_STATS); else MARK_INT_SAT(CODA_RDWR_STATS); /* Do an internal close if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td); } /* Invalidate cached attributes if writing. */ if (rw == UIO_WRITE) cp->c_flags &= ~C_VATTR; return(error); } int coda_ioctl(struct vop_ioctl_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; int com = ap->a_command; caddr_t data = ap->a_data; int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; struct vnode *tvp; struct nameidata ndp; struct PioctlData *iap = (struct PioctlData *)data; MARK_ENTRY(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));) /* Don't check for operation on a dying object, for ctlvp it shouldn't matter */ /* Must be control object to succeed. */ if (!IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: vp != ctlvp"));) return (EOPNOTSUPP); } /* Look up the pathname. */ /* Should we use the name cache here? It would get it from lookupname sooner or later anyway, right? */ NDINIT(&ndp, LOOKUP, (iap->follow ? FOLLOW : NOFOLLOW), UIO_USERSPACE, iap->path, td); error = namei(&ndp); tvp = ndp.ni_vp; if (error) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: lookup returns %d\n", error));) return(error); } /* * Make sure this is a coda style cnode, but it may be a * different vfsp */ if (tvp->v_op != &coda_vnodeops) { vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: %s not a coda object\n", iap->path));) return(EINVAL); } if (iap->vi.in_size > VC_MAXDATASIZE) { NDFREE(&ndp, 0); return(EINVAL); } error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, td->td_proc); if (error) MARK_INT_FAIL(CODA_IOCTL_STATS); else CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); ) vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); return(error); } /* * To reduce the cost of a user-level venus;we cache attributes in * the kernel. Each cnode has storage allocated for an attribute. If * c_vattr is valid, return a reference to it. Otherwise, get the * attributes from venus and store them in the cnode. There is some * question if this method is a security leak. But I think that in * order to make this call, the user must have done a lookup and * opened the file, and therefore should already have access. */ int coda_getattr(struct vop_getattr_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_GETATTR_STATS); if (IS_UNMOUNTING(cp)) return ENODEV; /* Check for getattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_GETATTR_STATS); return(ENOENT); } /* Check to see if the attributes have already been cached */ if (VALID_VATTR(cp)) { CODADEBUG(CODA_GETATTR, { myprintf(("attr cache hit: %s\n", coda_f2s(&cp->c_fid)));}); CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(&cp->c_vattr); ); *vap = cp->c_vattr; MARK_INT_SAT(CODA_GETATTR_STATS); return(0); } error = venus_getattr(vtomi(vp), &cp->c_fid, cred, td->td_proc, vap); if (!error) { CODADEBUG(CODA_GETATTR, myprintf(("getattr miss %s: result %d\n", coda_f2s(&cp->c_fid), error)); ) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(vap); ); { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } /* If not open for write, store attributes in cnode */ if ((cp->c_owrite == 0) && (coda_attr_cache)) { cp->c_vattr = *vap; cp->c_flags |= C_VATTR; } } return(error); } int coda_setattr(struct vop_setattr_args *ap) { /* true args */ register struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_SETATTR_STATS); /* Check for setattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_SETATTR_STATS); return(ENOENT); } if (codadebug & CODADBGMSK(CODA_SETATTR)) { print_vattr(vap); } error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, td->td_proc); if (!error) cp->c_flags &= ~C_VATTR; { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (size != VNOVAL && convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); ) return(error); } int coda_access(struct vop_access_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int mode = ap->a_mode; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_ACCESS_STATS); /* Check for access of control object. Only read access is allowed on it. */ if (IS_CTL_VP(vp)) { /* bogus hack - all will be marked as successes */ MARK_INT_SAT(CODA_ACCESS_STATS); return(((mode & VREAD) && !(mode & (VWRITE | VEXEC))) ? 0 : EACCES); } /* * if the file is a directory, and we are checking exec (eg lookup) * access, and the file is in the namecache, then the user must have * lookup access to it. */ if (coda_access_cache) { if ((vp->v_type == VDIR) && (mode & VEXEC)) { if (coda_nc_lookup(cp, ".", 1, cred)) { MARK_INT_SAT(CODA_ACCESS_STATS); return(0); /* it was in the cache */ } } } error = venus_access(vtomi(vp), &cp->c_fid, mode, cred, td->td_proc); return(error); } int coda_readlink(struct vop_readlink_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_uio->uio_td; /* locals */ int error; char *str; int len; MARK_ENTRY(CODA_READLINK_STATS); /* Check for readlink of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READLINK_STATS); return(ENOENT); } if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */ uiop->uio_rw = UIO_READ; error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop); if (error) MARK_INT_FAIL(CODA_READLINK_STATS); else MARK_INT_SAT(CODA_READLINK_STATS); return(error); } error = venus_readlink(vtomi(vp), &cp->c_fid, cred, td != NULL ? td->td_proc : NULL, &str, &len); if (!error) { uiop->uio_rw = UIO_READ; error = uiomove(str, len, uiop); if (coda_symlink_cache) { cp->c_symlink = str; cp->c_symlen = len; cp->c_flags |= C_SYMLINK; } else CODA_FREE(str, len); } CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));) return(error); } int coda_fsync(struct vop_fsync_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct thread *td = ap->a_td; /* locals */ struct vnode *convp = cp->c_ovp; int error; MARK_ENTRY(CODA_FSYNC_STATS); /* Check for fsync on an unmounting object */ /* The NetBSD kernel, in it's infinite wisdom, can try to fsync * after an unmount has been initiated. This is a Bad Thing, * which we have to avoid. Not a legitimate failure for stats. */ if (IS_UNMOUNTING(cp)) { return(ENODEV); } /* Check for fsync of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_FSYNC_STATS); return(0); } if (convp) VOP_FSYNC(convp, MNT_WAIT, td); /* * We see fsyncs with usecount == 1 then usecount == 0. * For now we ignore them. */ /* VI_LOCK(vp); if (!vp->v_usecount) { printf("coda_fsync on vnode %p with %d usecount. c_flags = %x (%x)\n", vp, vp->v_usecount, cp->c_flags, cp->c_flags&C_PURGING); } VI_UNLOCK(vp); */ /* * We can expect fsync on any vnode at all if venus is pruging it. * Venus can't very well answer the fsync request, now can it? * Hopefully, it won't have to, because hopefully, venus preserves * the (possibly untrue) invariant that it never purges an open * vnode. Hopefully. */ if (cp->c_flags & C_PURGING) { return(0); } /* needs research */ return 0; error = venus_fsync(vtomi(vp), &cp->c_fid, td->td_proc); CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); ); return(error); } int coda_inactive(struct vop_inactive_args *ap) { /* XXX - at the moment, inactive doesn't look at cred, and doesn't have a proc pointer. Oops. */ /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred __attribute__((unused)) = NULL; struct thread *td __attribute__((unused)) = curthread; /* upcall decl */ /* locals */ /* We don't need to send inactive to venus - DCS */ MARK_ENTRY(CODA_INACTIVE_STATS); CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n", coda_f2s(&cp->c_fid), vp->v_mount));) /* If an array has been allocated to hold the symlink, deallocate it */ if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { if (cp->c_symlink == NULL) panic("coda_inactive: null symlink pointer in cnode"); CODA_FREE(cp->c_symlink, cp->c_symlen); cp->c_flags &= ~C_SYMLINK; cp->c_symlen = 0; } /* Remove it from the table so it can't be found. */ coda_unsave(cp); if ((struct coda_mntinfo *)(vp->v_mount->mnt_data) == NULL) { myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp)); panic("badness in coda_inactive\n"); } if (IS_UNMOUNTING(cp)) { #ifdef DEBUG printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp); if (cp->c_ovp != NULL) printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp); #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(CTOV(cp))) { panic("coda_inactive: nonzero reference count"); } if (cp->c_ovp != NULL) { panic("coda_inactive: cp->ovp != NULL"); } #endif vgone(vp); } MARK_INT_SAT(CODA_INACTIVE_STATS); return(0); } /* * Remote filesystem operations having to do with directory manipulation. */ /* * It appears that in NetBSD, lookup is supposed to return the vnode locked */ int coda_lookup(struct vop_lookup_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vnode **vpp = ap->a_vpp; /* * It looks as though ap->a_cnp->ni_cnd->cn_nameptr holds the rest * of the string to xlate, and that we must try to get at least * ap->a_cnp->ni_cnd->cn_namelen of those characters to macth. I * could be wrong. */ struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; int vtype; int error = 0; MARK_ENTRY(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s in %s\n", nm, coda_f2s(&dcp->c_fid)));); /* Check for lookup of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = coda_ctlvp; vref(*vpp); MARK_INT_SAT(CODA_LOOKUP_STATS); goto exit; } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("name too long: lookup, %s (%s)\n", coda_f2s(&dcp->c_fid), nm));); *vpp = (struct vnode *)0; error = EINVAL; goto exit; } /* First try to look the file up in the cfs name cache */ /* lock the parent vnode? */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) { *vpp = CTOV(cp); vref(*vpp); CODADEBUG(CODA_LOOKUP, myprintf(("lookup result %d vpp %p\n",error,*vpp));) } else { /* The name wasn't cached, so we need to contact Venus */ error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc, &VFid, &vtype); if (error) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup error on %s (%s)%d\n", coda_f2s(&dcp->c_fid), nm, error));) *vpp = (struct vnode *)0; } else { MARK_INT_SAT(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s type %o result %d\n", coda_f2s(&VFid), vtype, error)); ) cp = make_coda_node(&VFid, dvp->v_mount, vtype); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache only if the top bit isn't set */ /* And don't enter a new vnode for an invalid one! */ if (!(vtype & CODA_NOCACHE)) coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); } } exit: /* * If we are creating, and this was the last name to be looked up, * and the error was ENOENT, then there really shouldn't be an * error and we can make the leaf NULL and return success. Since * this is supposed to work under Mach as well as NetBSD, we're * leaving this fn wrapped. We also must tell lookup/namei that * we need to save the last component of the name. (Create will * have to free the name buffer later...lucky us...) */ if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && (cnp->cn_flags & ISLASTCN) && (error == ENOENT)) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; *ap->a_vpp = NULL; } /* * If we are removing, and we are at the last element, and we * found it, then we need to keep the name around so that the * removal will go ahead as planned. Unfortunately, this will * probably also lock the to-be-removed vnode, which may or may * not be a good idea. I'll have to look at the bits of * coda_remove to make sure. We'll only save the name if we did in * fact find the name, otherwise coda_remove won't have a chance * to free the pathname. */ if ((cnp->cn_nameiop == DELETE) && (cnp->cn_flags & ISLASTCN) && !error) { cnp->cn_flags |= SAVENAME; } /* * If the lookup went well, we need to (potentially?) unlock the * parent, and lock the child. We are only responsible for * checking to see if the parent is supposed to be unlocked before * we return. We must always lock the child (provided there is * one, and (the parent isn't locked or it isn't the same as the * parent.) Simple, huh? We can never leave the parent locked unless * we are ISLASTCN */ if (!error || (error == EJUSTRETURN)) { if (cnp->cn_flags & ISDOTDOT) { if ((error = VOP_UNLOCK(dvp, 0, td))) { return error; } /* * The parent is unlocked. As long as there is a child, * lock it without bothering to check anything else. */ if (*ap->a_vpp) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { - vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td); + vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE); return (error); } } - vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td); + vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE); } else { /* The parent is locked, and may be the same as the child */ if (*ap->a_vpp && (*ap->a_vpp != dvp)) { /* Different, go ahead and lock it. */ if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { return (error); } } } } else { /* If the lookup failed, we need to ensure that the leaf is NULL */ /* Don't change any locking? */ *ap->a_vpp = NULL; } return(error); } /*ARGSUSED*/ int coda_create(struct vop_create_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vattr *va = ap->a_vap; int exclusive = 1; int mode = ap->a_vap->va_mode; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; struct vattr attr; MARK_ENTRY(CODA_CREATE_STATS); /* All creates are exclusive XXX */ /* I'm assuming the 'mode' argument is the file mode bits XXX */ /* Check for create of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_CREATE_STATS); return(EACCES); } error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, td->td_proc, &VFid, &attr); if (!error) { /* If this is an exclusive create, panic if the file already exists. */ /* Venus should have detected the file and reported EEXIST. */ if ((exclusive == 1) && (coda_find(&VFid) != NULL)) panic("cnode existed for newly created file!"); cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type); *vpp = CTOV(cp); /* Update va to reflect the new attributes. */ (*va) = attr; /* Update the attribute cache and mark it as valid */ if (coda_attr_cache) { VTOC(*vpp)->c_vattr = attr; VTOC(*vpp)->c_flags |= C_VATTR; } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); CODADEBUG(CODA_CREATE, myprintf(("create: %s, result %d\n", coda_f2s(&VFid), error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_CREATE, myprintf(("create error %d\n", error));) } if (!error) { if (cnp->cn_flags & LOCKLEAF) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { printf("coda_create: "); panic("unlocked parent but couldn't lock child"); } } #ifdef OLD_DIAGNOSTIC else { printf("coda_create: LOCKLEAF not set!\n"); } #endif } return(error); } int coda_remove(struct vop_remove_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *cp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *tp; MARK_ENTRY(CODA_REMOVE_STATS); CODADEBUG(CODA_REMOVE, myprintf(("remove: %s in %s\n", nm, coda_f2s(&cp->c_fid)));); /* Remove the file's entry from the CODA Name Cache */ /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* I'm gonna go out on a limb here. If a file and a hardlink to it * exist, and one is removed, the link count on the other will be * off by 1. We could either invalidate the attrs if cached, or * fix them. I'll try to fix them. DCS 11/8/94 */ tp = coda_nc_lookup(VTOC(dvp), nm, len, cred); if (tp) { if (VALID_VATTR(tp)) { /* If attrs are cached */ if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */ tp->c_vattr.va_nlink--; } } coda_nc_zapfile(VTOC(dvp), nm, len); /* No need to flush it if it doesn't exist! */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Check for remove of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_REMOVE_STATS); return(ENOENT); } error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, td->td_proc); CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); ) return(error); } int coda_link(struct vop_link_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vnode *tdvp = ap->a_tdvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; MARK_ENTRY(CODA_LINK_STATS); if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("nb_link: vp fid: %s\n", coda_f2s(&cp->c_fid))); myprintf(("nb_link: tdvp fid: %s)\n", coda_f2s(&tdcp->c_fid))); } if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("link: vp fid: %s\n", coda_f2s(&cp->c_fid))); myprintf(("link: tdvp fid: %s\n", coda_f2s(&tdcp->c_fid))); } /* Check for link to/from control object. */ if (IS_CTL_NAME(tdvp, nm, len) || IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_LINK_STATS); return(EACCES); } error = venus_link(vtomi(vp), &cp->c_fid, &tdcp->c_fid, nm, len, cred, td->td_proc); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(tdvp)->c_flags &= ~C_VATTR; VTOC(vp)->c_flags &= ~C_VATTR; CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); ) return(error); } int coda_rename(struct vop_rename_args *ap) { /* true args */ struct vnode *odvp = ap->a_fdvp; struct cnode *odcp = VTOC(odvp); struct componentname *fcnp = ap->a_fcnp; struct vnode *ndvp = ap->a_tdvp; struct cnode *ndcp = VTOC(ndvp); struct componentname *tcnp = ap->a_tcnp; struct ucred *cred = fcnp->cn_cred; struct thread *td = fcnp->cn_thread; /* true args */ int error; const char *fnm = fcnp->cn_nameptr; int flen = fcnp->cn_namelen; const char *tnm = tcnp->cn_nameptr; int tlen = tcnp->cn_namelen; MARK_ENTRY(CODA_RENAME_STATS); /* Hmmm. The vnodes are already looked up. Perhaps they are locked? This could be Bad. XXX */ #ifdef OLD_DIAGNOSTIC if ((fcnp->cn_cred != tcnp->cn_cred) || (fcnp->cn_thread != tcnp->cn_thread)) { panic("coda_rename: component names don't agree"); } #endif /* Check for rename involving control object. */ if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) { MARK_INT_FAIL(CODA_RENAME_STATS); return(EACCES); } /* Problem with moving directories -- need to flush entry for .. */ if (odvp != ndvp) { struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred); if (ovcp) { struct vnode *ovp = CTOV(ovcp); if ((ovp) && (ovp->v_type == VDIR)) /* If it's a directory */ coda_nc_zapfile(VTOC(ovp),"..", 2); } } /* Remove the entries for both source and target files */ coda_nc_zapfile(VTOC(odvp), fnm, flen); coda_nc_zapfile(VTOC(ndvp), tnm, tlen); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(odvp)->c_flags &= ~C_VATTR; VTOC(ndvp)->c_flags &= ~C_VATTR; if (flen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } if (tlen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, td->td_proc); exit: CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));) /* XXX - do we need to call cache pureg on the moved vnode? */ cache_purge(ap->a_fvp); /* Release parents first, then children. */ vrele(odvp); if (ap->a_tvp) { if (ap->a_tvp == ndvp) vrele(ndvp); else vput(ndvp); vput(ap->a_tvp); } else vput(ndvp); vrele(ap->a_fvp); return(error); } int coda_mkdir(struct vop_mkdir_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; register struct vattr *va = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; CodaFid VFid; struct vattr ova; MARK_ENTRY(CODA_MKDIR_STATS); /* Check for mkdir of target object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } if (len+1 > CODA_MAXNAMLEN) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, td->td_proc, &VFid, &ova); if (!error) { if (coda_find(&VFid) != NULL) panic("cnode existed for newly created directory!"); cp = make_coda_node(&VFid, dvp->v_mount, va->va_type); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); /* as a side effect, enter "." and ".." for the directory */ coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp)); coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp)); if (coda_attr_cache) { VTOC(*vpp)->c_vattr = ova; /* update the attr cache */ VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; CODADEBUG( CODA_MKDIR, myprintf(("mkdir: %s result %d\n", coda_f2s(&VFid), error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_MKDIR, myprintf(("mkdir error %d\n",error));) } return(error); } int coda_rmdir(struct vop_rmdir_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* true args */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; MARK_ENTRY(CODA_RMDIR_STATS); /* Check for rmdir of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_RMDIR_STATS); return(ENOENT); } /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* * As a side effect of the rmdir, remove any entries for children of * the directory, especially "." and "..". */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL); /* Remove the file's entry from the CODA Name Cache */ coda_nc_zapfile(dcp, nm, len); /* Invalidate the parent's attr cache, the modification time has changed */ dcp->c_flags &= ~C_VATTR; error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc); CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); ) return(error); } int coda_symlink(struct vop_symlink_args *ap) { /* true args */ struct vnode *tdvp = ap->a_dvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct vattr *tva = ap->a_vap; char *path = ap->a_target; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; struct vnode **vpp = ap->a_vpp; /* locals */ int error; /* * XXX I'm assuming the following things about coda_symlink's * arguments: * t(foo) is the new name/parent/etc being created. * lname is the contents of the new symlink. */ char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; int plen = strlen(path); /* * Here's the strategy for the moment: perform the symlink, then * do a lookup to grab the resulting vnode. I know this requires * two communications with Venus for a new sybolic link, but * that's the way the ball bounces. I don't yet want to change * the way the Mach symlink works. When Mach support is * deprecated, we should change symlink so that the common case * returns the resultant vnode in a vpp argument. */ MARK_ENTRY(CODA_SYMLINK_STATS); /* Check for symlink of control object. */ if (IS_CTL_NAME(tdvp, nm, len)) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EACCES); } if (plen+1 > CODA_MAXPATHLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EINVAL); } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } error = venus_symlink(vtomi(tdvp), &tdcp->c_fid, path, plen, nm, len, tva, cred, td->td_proc); /* Invalidate the parent's attr cache, the modification time has changed */ tdcp->c_flags &= ~C_VATTR; if (error == 0) error = VOP_LOOKUP(tdvp, vpp, cnp); exit: CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); ) return(error); } /* * Read directory entries. */ int coda_readdir(struct vop_readdir_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; int *eofflag = ap->a_eofflag; u_long **cookies = ap->a_cookies; int *ncookies = ap->a_ncookies; struct thread *td = ap->a_uio->uio_td; /* upcall decl */ /* locals */ int error = 0; MARK_ENTRY(CODA_READDIR_STATS); CODADEBUG(CODA_READDIR, myprintf(("coda_readdir(%p, %d, %lld, %d)\n", (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for readdir of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READDIR_STATS); return(ENOENT); } { /* If directory is not already open do an "internal open" on it. */ int opened_internally = 0; if (cp->c_ovp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, FREAD, cred, td, NULL); printf("coda_readdir: Internally Opening %p\n", vp); if (error) { printf("coda_readdir: VOP_OPEN on container failed %d\n", error); return (error); } } /* Have UFS handle the call. */ CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), vp->v_usecount)); ) error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, ncookies, cookies); if (error) MARK_INT_FAIL(CODA_READDIR_STATS); else MARK_INT_SAT(CODA_READDIR_STATS); /* Do an "internal close" if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, FREAD, cred, td); } } return(error); } /* * Convert from filesystem blocks to device blocks */ int coda_bmap(struct vop_bmap_args *ap) { /* XXX on the global proc */ /* true args */ struct vnode *vp __attribute__((unused)) = ap->a_vp; /* file's vnode */ daddr_t bn __attribute__((unused)) = ap->a_bn; /* fs block number */ struct bufobj **bop = ap->a_bop; /* RETURN bufobj of device */ daddr_t *bnp __attribute__((unused)) = ap->a_bnp; /* RETURN device block number */ struct thread *td __attribute__((unused)) = curthread; /* upcall decl */ /* locals */ int ret = 0; struct cnode *cp; cp = VTOC(vp); if (cp->c_ovp) { return EINVAL; ret = VOP_BMAP(cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb); #if 0 printf("VOP_BMAP(cp->c_ovp %p, bn %p, bop %p, bnp %lld, ap->a_runp %p, ap->a_runb %p) = %d\n", cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb, ret); #endif return ret; } else { #if 0 printf("coda_bmap: no container\n"); #endif return(EOPNOTSUPP); } } int coda_reclaim(struct vop_reclaim_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ /* * Forced unmount/flush will let vnodes with non zero use be destroyed! */ ENTRY; if (IS_UNMOUNTING(cp)) { #ifdef DEBUG if (VTOC(vp)->c_ovp) { if (IS_UNMOUNTING(cp)) printf("coda_reclaim: c_ovp not void: vp %p, cp %p\n", vp, cp); } #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(vp) != 0) print("coda_reclaim: pushing active %p\n", vp); if (VTOC(vp)->c_ovp) { panic("coda_reclaim: c_ovp not void"); } #endif } cache_purge(vp); coda_free(VTOC(vp)); vp->v_data = NULL; vnode_destroy_vobject(vp); return (0); } int coda_lock(struct vop_lock1_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if ((ap->a_flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); ap->a_flags |= LK_INTERLOCK; } if (coda_lockdebug) { myprintf(("Attempting lock on %s\n", coda_f2s(&cp->c_fid))); } return (vop_stdlock(ap)); } int coda_unlock(struct vop_unlock_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting unlock on %s\n", coda_f2s(&cp->c_fid))); } return (vop_stdunlock(ap)); } int coda_islocked(struct vop_islocked_args *ap) { /* true args */ ENTRY; return (vop_stdislocked(ap)); } void print_vattr(struct vattr *attr) { char *typestr; switch (attr->va_type) { case VNON: typestr = "VNON"; break; case VREG: typestr = "VREG"; break; case VDIR: typestr = "VDIR"; break; case VBLK: typestr = "VBLK"; break; case VCHR: typestr = "VCHR"; break; case VLNK: typestr = "VLNK"; break; case VSOCK: typestr = "VSCK"; break; case VFIFO: typestr = "VFFO"; break; case VBAD: typestr = "VBAD"; break; default: typestr = "????"; break; } myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n", typestr, (int)attr->va_mode, (int)attr->va_uid, (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev)); myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n", (int)attr->va_fileid, (int)attr->va_nlink, (int)attr->va_size, (int)attr->va_blocksize,(int)attr->va_bytes)); myprintf((" gen %ld flags %ld vaflags %d\n", attr->va_gen, attr->va_flags, attr->va_vaflags)); myprintf((" atime sec %d nsec %d\n", (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec)); myprintf((" mtime sec %d nsec %d\n", (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec)); myprintf((" ctime sec %d nsec %d\n", (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec)); } /* How to print a ucred */ void print_cred(struct ucred *cred) { int i; myprintf(("ref %d\tuid %d\n",cred->cr_ref,cred->cr_uid)); for (i=0; i < cred->cr_ngroups; i++) myprintf(("\tgroup %d: (%d)\n",i,cred->cr_groups[i])); myprintf(("\n")); } /* * Return a vnode for the given fid. * If no cnode exists for this fid create one and put it * in a table hashed by coda_f2i(). If the cnode for * this fid is already in the table return it (ref count is * incremented by coda_find. The cnode will be flushed from the * table when coda_inactive calls coda_unsave. */ struct cnode * make_coda_node(CodaFid *fid, struct mount *vfsp, short type) { struct cnode *cp; int err; if ((cp = coda_find(fid)) == NULL) { struct vnode *vp; cp = coda_alloc(); cp->c_fid = *fid; err = getnewvnode("coda", vfsp, &coda_vnodeops, &vp); if (err) { panic("coda: getnewvnode returned error %d\n", err); } err = insmntque1(vp, vfsp, NULL, NULL); /* XXX: Too early for mpsafe fs */ if (err != 0) panic("coda: insmntque failed: error %d", err); vp->v_data = cp; vp->v_type = type; cp->c_vnode = vp; coda_save(cp); } else { vref(CTOV(cp)); } return cp; } int coda_pathconf(struct vop_pathconf_args *ap) { int error; register_t *retval; retval = ap->a_retval; error = 0; switch (ap->a_name) { case _PC_NAME_MAX: *retval = CODA_MAXNAMLEN; break; case _PC_PATH_MAX: *retval = CODA_MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } Index: head/sys/fs/devfs/devfs_devs.c =================================================================== --- head/sys/fs/devfs/devfs_devs.c (revision 175201) +++ head/sys/fs/devfs/devfs_devs.c (revision 175202) @@ -1,541 +1,541 @@ /*- * Copyright (c) 2000,2004 * Poul-Henning Kamp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vfsops.c 1.36 * * $FreeBSD$ */ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The one true (but secret) list of active devices in the system. * Locked by dev_lock()/devmtx */ struct cdev_priv_list cdevp_list = TAILQ_HEAD_INITIALIZER(cdevp_list); struct unrhdr *devfs_inos; static MALLOC_DEFINE(M_DEVFS2, "DEVFS2", "DEVFS data 2"); static MALLOC_DEFINE(M_DEVFS3, "DEVFS3", "DEVFS data 3"); static MALLOC_DEFINE(M_CDEVP, "DEVFS1", "DEVFS cdev_priv storage"); static SYSCTL_NODE(_vfs, OID_AUTO, devfs, CTLFLAG_RW, 0, "DEVFS filesystem"); static unsigned devfs_generation; SYSCTL_UINT(_vfs_devfs, OID_AUTO, generation, CTLFLAG_RD, &devfs_generation, 0, "DEVFS generation number"); unsigned devfs_rule_depth = 1; SYSCTL_UINT(_vfs_devfs, OID_AUTO, rule_depth, CTLFLAG_RW, &devfs_rule_depth, 0, "Max depth of ruleset include"); /* * Helper sysctl for devname(3). We're given a struct cdev * and return * the name, if any, registered by the device driver. */ static int sysctl_devname(SYSCTL_HANDLER_ARGS) { int error; dev_t ud; struct cdev_priv *cdp; error = SYSCTL_IN(req, &ud, sizeof (ud)); if (error) return (error); if (ud == NODEV) return(EINVAL); /* ud ^ devfs_random(); */ dev_lock(); TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) if (cdp->cdp_inode == ud) break; dev_unlock(); if (cdp == NULL) return(ENOENT); return(SYSCTL_OUT(req, cdp->cdp_c.si_name, strlen(cdp->cdp_c.si_name) + 1)); return (error); } SYSCTL_PROC(_kern, OID_AUTO, devname, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY, NULL, 0, sysctl_devname, "", "devname(3) handler"); SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev, CTLFLAG_RD, 0, sizeof(struct cdev), "sizeof(struct cdev)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev_priv, CTLFLAG_RD, 0, sizeof(struct cdev_priv), "sizeof(struct cdev_priv)"); struct cdev * devfs_alloc(void) { struct cdev_priv *cdp; struct cdev *cdev; cdp = malloc(sizeof *cdp, M_CDEVP, M_USE_RESERVE | M_ZERO | M_WAITOK); cdp->cdp_dirents = &cdp->cdp_dirent0; cdp->cdp_dirent0 = NULL; cdp->cdp_maxdirent = 0; cdev = &cdp->cdp_c; cdev->si_priv = cdp; cdev->si_name = cdev->__si_namebuf; LIST_INIT(&cdev->si_children); return (cdev); } void devfs_free(struct cdev *cdev) { struct cdev_priv *cdp; cdp = cdev->si_priv; if (cdev->si_cred != NULL) crfree(cdev->si_cred); if (cdp->cdp_inode > 0) free_unr(devfs_inos, cdp->cdp_inode); if (cdp->cdp_maxdirent > 0) free(cdp->cdp_dirents, M_DEVFS2); free(cdp, M_CDEVP); } struct devfs_dirent * devfs_find(struct devfs_dirent *dd, const char *name, int namelen) { struct devfs_dirent *de; TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (namelen != de->de_dirent->d_namlen) continue; if (bcmp(name, de->de_dirent->d_name, namelen) != 0) continue; break; } return (de); } struct devfs_dirent * devfs_newdirent(char *name, int namelen) { int i; struct devfs_dirent *de; struct dirent d; d.d_namlen = namelen; i = sizeof (*de) + GENERIC_DIRSIZ(&d); de = malloc(i, M_DEVFS3, M_WAITOK | M_ZERO); de->de_dirent = (struct dirent *)(de + 1); de->de_dirent->d_namlen = namelen; de->de_dirent->d_reclen = GENERIC_DIRSIZ(&d); bcopy(name, de->de_dirent->d_name, namelen); de->de_dirent->d_name[namelen] = '\0'; vfs_timestamp(&de->de_ctime); de->de_mtime = de->de_atime = de->de_ctime; de->de_links = 1; de->de_holdcnt = 1; #ifdef MAC mac_devfs_init(de); #endif return (de); } struct devfs_dirent * devfs_vmkdir(struct devfs_mount *dmp, char *name, int namelen, struct devfs_dirent *dotdot, u_int inode) { struct devfs_dirent *dd; struct devfs_dirent *de; /* Create the new directory */ dd = devfs_newdirent(name, namelen); TAILQ_INIT(&dd->de_dlist); dd->de_dirent->d_type = DT_DIR; dd->de_mode = 0555; dd->de_links = 2; dd->de_dir = dd; if (inode != 0) dd->de_inode = inode; else dd->de_inode = alloc_unr(devfs_inos); /* Create the "." entry in the new directory */ de = devfs_newdirent(".", 1); de->de_dirent->d_type = DT_DIR; de->de_flags |= DE_DOT; TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); de->de_dir = dd; /* Create the ".." entry in the new directory */ de = devfs_newdirent("..", 2); de->de_dirent->d_type = DT_DIR; de->de_flags |= DE_DOTDOT; TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); if (dotdot == NULL) { de->de_dir = dd; } else { de->de_dir = dotdot; TAILQ_INSERT_TAIL(&dotdot->de_dlist, dd, de_list); dotdot->de_links++; } #ifdef MAC mac_devfs_create_directory(dmp->dm_mount, name, namelen, dd); #endif return (dd); } void devfs_dirent_free(struct devfs_dirent *de) { free(de, M_DEVFS3); } /* * The caller needs to hold the dm for the duration of the call since * dm->dm_lock may be temporary dropped. */ void devfs_delete(struct devfs_mount *dm, struct devfs_dirent *de, int vp_locked) { struct vnode *vp; struct thread *td; KASSERT((de->de_flags & DE_DOOMED) == 0, ("devfs_delete doomed dirent")); td = curthread; de->de_flags |= DE_DOOMED; mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); vholdl(vp); sx_unlock(&dm->dm_lock); if (!vp_locked) - vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); else VI_UNLOCK(vp); vgone(vp); if (!vp_locked) VOP_UNLOCK(vp, 0, td); vdrop(vp); sx_xlock(&dm->dm_lock); } else mtx_unlock(&devfs_de_interlock); if (de->de_symlink) { free(de->de_symlink, M_DEVFS); de->de_symlink = NULL; } #ifdef MAC mac_devfs_destroy(de); #endif if (de->de_inode > DEVFS_ROOTINO) { free_unr(devfs_inos, de->de_inode); de->de_inode = 0; } if (DEVFS_DE_DROP(de)) devfs_dirent_free(de); } /* * Called on unmount. * Recursively removes the entire tree. * The caller needs to hold the dm for the duration of the call. */ static void devfs_purge(struct devfs_mount *dm, struct devfs_dirent *dd) { struct devfs_dirent *de; sx_assert(&dm->dm_lock, SX_XLOCKED); for (;;) { de = TAILQ_FIRST(&dd->de_dlist); if (de == NULL) break; TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_flags & (DE_DOT|DE_DOTDOT)) devfs_delete(dm, de, 0); else if (de->de_dirent->d_type == DT_DIR) devfs_purge(dm, de); else devfs_delete(dm, de, 0); } devfs_delete(dm, dd, 0); } /* * Each cdev_priv has an array of pointers to devfs_dirent which is indexed * by the mount points dm_idx. * This function extends the array when necessary, taking into account that * the default array is 1 element and not malloc'ed. */ static void devfs_metoo(struct cdev_priv *cdp, struct devfs_mount *dm) { struct devfs_dirent **dep; int siz; siz = (dm->dm_idx + 1) * sizeof *dep; dep = malloc(siz, M_DEVFS2, M_WAITOK | M_ZERO); dev_lock(); if (dm->dm_idx <= cdp->cdp_maxdirent) { /* We got raced */ dev_unlock(); free(dep, M_DEVFS2); return; } memcpy(dep, cdp->cdp_dirents, (cdp->cdp_maxdirent + 1) * sizeof *dep); if (cdp->cdp_maxdirent > 0) free(cdp->cdp_dirents, M_DEVFS2); cdp->cdp_dirents = dep; /* * XXX: if malloc told us how much we actually got this could * XXX: be optimized. */ cdp->cdp_maxdirent = dm->dm_idx; dev_unlock(); } /* * The caller needs to hold the dm for the duration of the call. */ static int devfs_populate_loop(struct devfs_mount *dm, int cleanup) { struct cdev_priv *cdp; struct devfs_dirent *de; struct devfs_dirent *dd; struct cdev *pdev; int j; char *q, *s; sx_assert(&dm->dm_lock, SX_XLOCKED); dev_lock(); TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) { KASSERT(cdp->cdp_dirents != NULL, ("NULL cdp_dirents")); /* * If we are unmounting, or the device has been destroyed, * clean up our dirent. */ if ((cleanup || !(cdp->cdp_flags & CDP_ACTIVE)) && dm->dm_idx <= cdp->cdp_maxdirent && cdp->cdp_dirents[dm->dm_idx] != NULL) { de = cdp->cdp_dirents[dm->dm_idx]; cdp->cdp_dirents[dm->dm_idx] = NULL; KASSERT(cdp == de->de_cdp, ("%s %d %s %p %p", __func__, __LINE__, cdp->cdp_c.si_name, cdp, de->de_cdp)); KASSERT(de->de_dir != NULL, ("Null de->de_dir")); dev_unlock(); TAILQ_REMOVE(&de->de_dir->de_dlist, de, de_list); de->de_cdp = NULL; de->de_inode = 0; devfs_delete(dm, de, 0); dev_lock(); cdp->cdp_inuse--; dev_unlock(); return (1); } /* * GC any lingering devices */ if (!(cdp->cdp_flags & CDP_ACTIVE)) { if (cdp->cdp_inuse > 0) continue; TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); return (1); } /* * Don't create any new dirents if we are unmounting */ if (cleanup) continue; KASSERT((cdp->cdp_flags & CDP_ACTIVE), ("Bogons, I tell ya'!")); if (dm->dm_idx <= cdp->cdp_maxdirent && cdp->cdp_dirents[dm->dm_idx] != NULL) { de = cdp->cdp_dirents[dm->dm_idx]; KASSERT(cdp == de->de_cdp, ("inconsistent cdp")); continue; } cdp->cdp_inuse++; dev_unlock(); if (dm->dm_idx > cdp->cdp_maxdirent) devfs_metoo(cdp, dm); dd = dm->dm_rootdir; s = cdp->cdp_c.si_name; for (;;) { for (q = s; *q != '/' && *q != '\0'; q++) continue; if (*q != '/') break; de = devfs_find(dd, s, q - s); if (de == NULL) de = devfs_vmkdir(dm, s, q - s, dd, 0); s = q + 1; dd = de; } de = devfs_newdirent(s, q - s); if (cdp->cdp_c.si_flags & SI_ALIAS) { de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_dirent->d_type = DT_LNK; pdev = cdp->cdp_c.si_parent; j = strlen(pdev->si_name) + 1; de->de_symlink = malloc(j, M_DEVFS, M_WAITOK); bcopy(pdev->si_name, de->de_symlink, j); } else { de->de_uid = cdp->cdp_c.si_uid; de->de_gid = cdp->cdp_c.si_gid; de->de_mode = cdp->cdp_c.si_mode; de->de_dirent->d_type = DT_CHR; } de->de_inode = cdp->cdp_inode; de->de_cdp = cdp; #ifdef MAC mac_devfs_create_device(cdp->cdp_c.si_cred, dm->dm_mount, &cdp->cdp_c, de); #endif de->de_dir = dd; TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); devfs_rules_apply(dm, de); dev_lock(); /* XXX: could check that cdp is still active here */ KASSERT(cdp->cdp_dirents[dm->dm_idx] == NULL, ("%s %d\n", __func__, __LINE__)); cdp->cdp_dirents[dm->dm_idx] = de; KASSERT(de->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); dev_unlock(); return (1); } dev_unlock(); return (0); } /* * The caller needs to hold the dm for the duration of the call. */ void devfs_populate(struct devfs_mount *dm) { sx_assert(&dm->dm_lock, SX_XLOCKED); if (dm->dm_generation == devfs_generation) return; while (devfs_populate_loop(dm, 0)) continue; dm->dm_generation = devfs_generation; } /* * The caller needs to hold the dm for the duration of the call. */ void devfs_cleanup(struct devfs_mount *dm) { sx_assert(&dm->dm_lock, SX_XLOCKED); while (devfs_populate_loop(dm, 1)) continue; devfs_purge(dm, dm->dm_rootdir); } /* * devfs_create() and devfs_destroy() are called from kern_conf.c and * in both cases the devlock() mutex is held, so no further locking * is necesary and no sleeping allowed. */ void devfs_create(struct cdev *dev) { struct cdev_priv *cdp; mtx_assert(&devmtx, MA_OWNED); cdp = dev->si_priv; cdp->cdp_flags |= CDP_ACTIVE; cdp->cdp_inode = alloc_unrl(devfs_inos); dev_refl(dev); TAILQ_INSERT_TAIL(&cdevp_list, cdp, cdp_list); devfs_generation++; } void devfs_destroy(struct cdev *dev) { struct cdev_priv *cdp; mtx_assert(&devmtx, MA_OWNED); cdp = dev->si_priv; cdp->cdp_flags &= ~CDP_ACTIVE; devfs_generation++; } static void devfs_devs_init(void *junk __unused) { devfs_inos = new_unrhdr(DEVFS_ROOTINO + 1, INT_MAX, &devmtx); } SYSINIT(devfs_devs, SI_SUB_DEVFS, SI_ORDER_FIRST, devfs_devs_init, NULL); Index: head/sys/fs/devfs/devfs_vnops.c =================================================================== --- head/sys/fs/devfs/devfs_vnops.c (revision 175201) +++ head/sys/fs/devfs/devfs_vnops.c (revision 175202) @@ -1,1412 +1,1412 @@ /*- * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * remove empty directories * mkdir: want it ? */ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp) { *dswp = devvn_refthread(fp->f_vnode, devp); if (*devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); return (0); } /* * Construct the fully qualified path name relative to the mountpoint */ static char * devfs_fqpn(char *buf, struct vnode *dvp, struct componentname *cnp) { int i; struct devfs_dirent *de, *dd; struct devfs_mount *dmp; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; i = SPECNAMELEN; buf[i] = '\0'; i -= cnp->cn_namelen; if (i < 0) return (NULL); bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { i--; if (i < 0) return (NULL); buf[i] = '/'; i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = TAILQ_FIRST(&de->de_dlist); /* "." */ de = TAILQ_NEXT(de, de_list); /* ".." */ de = de->de_dir; } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, struct vnode **vpp, struct thread *td) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; KASSERT(td == curthread, ("devfs_allocv: td != curthread")); dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { if (error == 0) vput(vp); return (ENOENT); } else if (error) goto loop; sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; KASSERT(vp->v_usecount == 1, ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount)); dev->si_usecount += vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_mode, ap->a_cred, NULL); if (!error) return (error); if (error != EACCES) return (error); /* We do, however, allow access to the controlling terminal */ if (!(ap->a_td->td_proc->p_flag & P_CONTROLT)) return (error); if (ap->a_td->td_proc->p_session->s_ttyvp == de->de_vnode) return (0); return (error); } /* ARGSUSED */ static int devfs_advlock(struct vop_advlock_args *ap) { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* ARGSUSED */ static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int vp_locked, error; /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ oldvp = NULL; sx_xlock(&proctree_lock); if (td && vp == td->td_proc->p_session->s_ttyvp) { SESS_LOCK(td->td_proc->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { td->td_proc->p_session->s_ttyvp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(td->td_proc->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { /* Forced close. */ } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev); return (0); } vholdl(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp, td); VOP_UNLOCK(vp, 0, td); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); PICKUP_GIANT(); } else { error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); } dev_relthread(dev); - vn_lock(vp, vp_locked | LK_RETRY, td); + vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { return (vnops.fo_close(fp, td)); } /* ARGSUSED */ static int devfs_fsync(struct vop_fsync_args *ap) { if (!vn_isdisk(ap->a_vp, NULL)) return (0); return (vop_stdfsync(ap)); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; int error = 0; struct devfs_dirent *de; struct cdev *dev; de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } bzero((caddr_t) vap, sizeof(*vap)); vattr_null(vap); vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = dev->si_priv->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct vnode *vp; struct vnode *vpold; int error, i; const char *p; struct fiodgname_arg *fgn; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); if (com == FIODTYPE) { *(int *)data = dsw->d_flags & D_TYPEMASK; dev_relthread(dev); return (0); } else if (com == FIODGNAME) { fgn = data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fgn->buf, i); dev_relthread(dev); return (error); } error = dsw->d_ioctl(dev, com, data, fp->f_flag, td); dev_relthread(dev); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { vp = fp->f_vnode; /* Do nothing if reassigning same control tty */ sx_slock(&proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { sx_sunlock(&proctree_lock); return (0); } mtx_lock(&Giant); /* XXX TTY */ vpold = td->td_proc->p_session->s_ttyvp; VREF(vp); SESS_LOCK(td->td_proc->p_session); td->td_proc->p_session->s_ttyvp = vp; SESS_UNLOCK(td->td_proc->p_session); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) vrele(vpold); mtx_unlock(&Giant); /* XXX TTY */ } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); error = dsw->d_kqfilter(dev, kn); dev_relthread(dev); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct cdev *cdev; int error, flags, nameiop; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0, td); de = TAILQ_FIRST(&dd->de_dlist); /* "." */ de = TAILQ_NEXT(de, de_list); /* ".." */ de = de->de_dir; error = devfs_allocv(de, dvp->v_mount, vpp, td); *dm_unlock = 0; - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dvp, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } if (cdev == NULL) break; DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } dev_lock(); dde = &cdev->si_priv->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, dvp->v_mount, vpp, td); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; sx_xlock(&dmp->dm_lock); j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; td = cnp->cn_thread; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, vpp, td); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error; struct cdevsw *dsw; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); /* XXX: Special casing of ttys for deadfs. Probably redundant. */ if (dsw->d_flags & D_TTY) vp->v_vflag |= VV_ISTTY; VOP_UNLOCK(vp, 0, td); if(!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); PICKUP_GIANT(); } else { if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); dev_relthread(dev); if (error) return (error); #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if(fp == NULL) return (error); #endif KASSERT(fp->f_ops == &badfileops, ("Could not vnode bypass device on fdops %p", fp->f_ops)); finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); error = dsw->d_poll(dev, events, td); dev_relthread(dev); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } /* ARGSUSED */ static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, resid; struct cdevsw *dsw; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) vfs_timestamp(&dev->si_atime); dev_relthread(dev); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off, oldoff; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; oldoff = uio->uio_offset; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & DE_WHITEOUT) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); de = vp->v_data; if (de != NULL) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vnode_destroy_vobject(vp); VI_LOCK(vp); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; if (dev == NULL) { dev_unlock(); VI_UNLOCK(vp); return (0); } dev->si_usecount -= vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); devfs_delete(dmp, de, 1); } else { de->de_flags |= DE_WHITEOUT; } sx_xunlock(&dmp->dm_lock); return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. * */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = dev->si_priv; dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp,0,curthread); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); VI_LOCK(vp2); mtx_unlock(&devfs_de_interlock); if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { int error; struct devfs_mount *dmp; dmp = VFSTODEVFS(ap->a_vp->v_mount); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(ap->a_td, PRIV_VFS_CHOWN); if (error) return (error); } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(ap->a_td, PRIV_VFS_ADMIN); if (error) return (error); } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } return (0); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; struct thread *td; td = ap->a_cnp->cn_thread; KASSERT(td == curthread, ("devfs_symlink: td != curthread")); error = priv_check(td, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); sx_xlock(&dmp->dm_lock); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); return (devfs_allocv(de, ap->a_dvp->v_mount, ap->a_vpp, td)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } /* ARGSUSED */ static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, resid; struct cdevsw *dsw; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { vfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } dev_relthread(dev); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (x->si_priv->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, }; static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_advlock = devfs_advlock, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = devfs_fsync, .vop_getattr = devfs_getattr, .vop_lease = VOP_NULL, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_print = devfs_print, .vop_read = VOP_PANIC, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_write = VOP_PANIC, }; /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); Index: head/sys/fs/fdescfs/fdesc_vfsops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vfsops.c (revision 175201) +++ head/sys/fs/fdescfs/fdesc_vfsops.c (revision 175202) @@ -1,214 +1,214 @@ /*- * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FDESCMNT, "fdesc_mount", "FDESC mount structure"); static vfs_cmount_t fdesc_cmount; static vfs_mount_t fdesc_mount; static vfs_unmount_t fdesc_unmount; static vfs_statfs_t fdesc_statfs; static vfs_root_t fdesc_root; /* * Compatibility shim for old mount(2) system call. */ int fdesc_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { return kernel_mount(ma, flags); } /* * Mount the per-process file descriptors (/dev/fd) */ static int fdesc_mount(struct mount *mp, struct thread *td) { int error = 0; struct fdescmount *fmp; struct vnode *rvp; /* * Update is a no-op */ if (mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS)) return (EOPNOTSUPP); error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp, td); if (error) return (error); MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount), M_FDESCMNT, M_WAITOK); /* XXX */ rvp->v_type = VDIR; rvp->v_vflag |= VV_ROOT; fmp->f_root = rvp; /* XXX -- don't mark as local to work around fts() problems */ /*mp->mnt_flag |= MNT_LOCAL;*/ mp->mnt_data = fmp; vfs_getnewfsid(mp); vfs_mountedfrom(mp, "fdescfs"); return (0); } static int fdesc_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { int error; int flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... * * There is 1 extra root vnode reference corresponding * to f_root. */ if ((error = vflush(mp, 1, flags, td)) != 0) return (error); /* * Finally, throw away the fdescmount structure */ free(mp->mnt_data, M_FDESCMNT); /* XXX */ mp->mnt_data = 0; return (0); } static int fdesc_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct vnode *vp; /* * Return locked reference to root. */ vp = VFSTOFDESC(mp)->f_root; VREF(vp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); *vpp = vp; return (0); } static int fdesc_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct filedesc *fdp; int lim; int i; int last; int freefd; /* * Compute number of free file descriptors. * [ Strange results will ensue if the open file * limit is ever reduced below the current number * of open files... ] */ PROC_LOCK(td->td_proc); lim = lim_cur(td->td_proc, RLIMIT_NOFILE); PROC_UNLOCK(td->td_proc); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); last = min(fdp->fd_nfiles, lim); freefd = 0; for (i = fdp->fd_freefile; i < last; i++) if (fdp->fd_ofiles[i] == NULL) freefd++; /* * Adjust for the fact that the fdesc array may not * have been fully allocated yet. */ if (fdp->fd_nfiles < lim) freefd += (lim - fdp->fd_nfiles); FILEDESC_SUNLOCK(fdp); sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = lim + 1; /* Allow for "." */ sbp->f_ffree = freefd; /* See comments above */ return (0); } static struct vfsops fdesc_vfsops = { .vfs_cmount = fdesc_cmount, .vfs_init = fdesc_init, .vfs_mount = fdesc_mount, .vfs_root = fdesc_root, .vfs_statfs = fdesc_statfs, .vfs_unmount = fdesc_unmount, }; VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC); Index: head/sys/fs/fdescfs/fdesc_vnops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vnops.c (revision 175201) +++ head/sys/fs/fdescfs/fdesc_vnops.c (revision 175202) @@ -1,536 +1,536 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include /* boottime */ #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #define FDL_WANT 0x01 #define FDL_LOCKED 0x02 static int fdcache_lock; #define NFDCACHE 4 #define FD_NHASH(ix) \ (&fdhashtbl[(ix) & fdhash]) static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; static u_long fdhash; static vop_getattr_t fdesc_getattr; static vop_inactive_t fdesc_inactive; static vop_lookup_t fdesc_lookup; static vop_open_t fdesc_open; static vop_readdir_t fdesc_readdir; static vop_reclaim_t fdesc_reclaim; static vop_setattr_t fdesc_setattr; static struct vop_vector fdesc_vnodeops = { .vop_default = &default_vnodeops, .vop_access = VOP_NULL, .vop_getattr = fdesc_getattr, .vop_inactive = fdesc_inactive, .vop_lookup = fdesc_lookup, .vop_open = fdesc_open, .vop_pathconf = vop_stdpathconf, .vop_readdir = fdesc_readdir, .vop_reclaim = fdesc_reclaim, .vop_setattr = fdesc_setattr, }; /* * Initialise cache headers */ int fdesc_init(vfsp) struct vfsconf *vfsp; { fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); return (0); } int fdesc_allocvp(ftype, ix, mp, vpp, td) fdntype ftype; int ix; struct mount *mp; struct vnode **vpp; struct thread *td; { struct fdhashhead *fc; struct fdescnode *fd; int error = 0; fc = FD_NHASH(ix); loop: LIST_FOREACH(fd, fc, fd_hash) { if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { if (vget(fd->fd_vnode, 0, td)) goto loop; *vpp = fd->fd_vnode; return (error); } } /* * otherwise lock the array while we call getnewvnode * since that can block. */ if (fdcache_lock & FDL_LOCKED) { fdcache_lock |= FDL_WANT; (void) tsleep( &fdcache_lock, PINOD, "fdalvp", 0); goto loop; } fdcache_lock |= FDL_LOCKED; /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. */ MALLOC(fd, struct fdescnode *, sizeof(struct fdescnode), M_TEMP, M_WAITOK); error = getnewvnode("fdesc", mp, &fdesc_vnodeops, vpp); if (error) { FREE(fd, M_TEMP); goto out; } (*vpp)->v_data = fd; fd->fd_vnode = *vpp; fd->fd_type = ftype; fd->fd_fd = -1; fd->fd_ix = ix; /* XXX: vnode should be locked here */ error = insmntque(*vpp, mp); /* XXX: Too early for mpsafe fs */ if (error != 0) { free(fd, M_TEMP); *vpp = NULLVP; goto out; } LIST_INSERT_HEAD(fc, fd, fd_hash); out: fdcache_lock &= ~FDL_LOCKED; if (fdcache_lock & FDL_WANT) { fdcache_lock &= ~FDL_WANT; wakeup( &fdcache_lock); } return (error); } /* * vp is the current namei directory * ndp is the name to locate in that directory... */ static int fdesc_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; struct file *fp; int nlen = cnp->cn_namelen; u_int fd; int error; struct vnode *fvp; if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad; } if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd = 10 * fd + *pname++ - '0'; } if ((error = fget(td, fd, &fp)) != 0) goto bad; error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp, td); fdrop(fp, td); if (error) goto bad; VTOFDESC(fvp)->fd_fd = fd; if (fvp != dvp) - vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); *vpp = fvp; return (0); bad: *vpp = NULL; return (error); } static int fdesc_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; if (VTOFDESC(vp)->fd_type == Froot) return (0); /* * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the the file * descriptor being sought for duplication. The error return ensures * that the vnode for this device will be released by vn_open. Open * will detect this special error and take the actions in dupfdopen. * Other callers of vn_open or VOP_OPEN will simply report the * error. */ ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return (ENODEV); } static int fdesc_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct file *fp; struct stat stb; u_int fd; int error = 0; switch (VTOFDESC(vp)->fd_type) { case Froot: VATTR_NULL(vap); vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_type = VDIR; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_uid = 0; vap->va_gid = 0; vap->va_blocksize = DEV_BSIZE; vap->va_atime.tv_sec = boottime.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; break; case Fdesc: fd = VTOFDESC(vp)->fd_fd; if ((error = fget(ap->a_td, fd, &fp)) != 0) return (error); bzero(&stb, sizeof(stb)); error = fo_stat(fp, &stb, ap->a_td->td_ucred, ap->a_td); fdrop(fp, ap->a_td); if (error == 0) { VATTR_NULL(vap); vap->va_type = IFTOVT(stb.st_mode); vap->va_mode = stb.st_mode; #define FDRX (VREAD|VEXEC) if (vap->va_type == VDIR) vap->va_mode &= ~((FDRX)|(FDRX>>3)|(FDRX>>6)); #undef FDRX vap->va_nlink = 1; vap->va_flags = 0; vap->va_bytes = stb.st_blocks * stb.st_blksize; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_size = stb.st_size; vap->va_blocksize = stb.st_blksize; vap->va_rdev = stb.st_rdev; /* * If no time data is provided, use the current time. */ if (stb.st_atimespec.tv_sec == 0 && stb.st_atimespec.tv_nsec == 0) nanotime(&stb.st_atimespec); if (stb.st_ctimespec.tv_sec == 0 && stb.st_ctimespec.tv_nsec == 0) nanotime(&stb.st_ctimespec); if (stb.st_mtimespec.tv_sec == 0 && stb.st_mtimespec.tv_nsec == 0) nanotime(&stb.st_mtimespec); vap->va_atime = stb.st_atimespec; vap->va_mtime = stb.st_mtimespec; vap->va_ctime = stb.st_ctimespec; vap->va_uid = stb.st_uid; vap->va_gid = stb.st_gid; } break; default: panic("fdesc_getattr"); break; } if (error == 0) vp->v_type = vap->va_type; return (error); } static int fdesc_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; struct file *fp; unsigned fd; int error; /* * Can't mess with the root vnode */ if (VTOFDESC(ap->a_vp)->fd_type == Froot) return (EACCES); fd = VTOFDESC(ap->a_vp)->fd_fd; /* * Allow setattr where there is an underlying vnode. */ error = getvnode(ap->a_td->td_proc->p_fd, fd, &fp); if (error) { /* * getvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. */ if (error == EINVAL) { if (vap->va_flags != VNOVAL) error = EOPNOTSUPP; else error = 0; } return (error); } vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); VOP_UNLOCK(vp, 0, ap->a_td); vn_finished_write(mp); } fdrop(fp, ap->a_td); return (error); } #define UIO_MX 16 static int fdesc_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { struct uio *uio = ap->a_uio; struct filedesc *fdp; struct dirent d; struct dirent *dp = &d; int error, i, off, fcnt; /* * We don't allow exporting fdesc mounts, and currently local * requests do not need cookies. */ if (ap->a_ncookies) panic("fdesc_readdir: not hungry"); if (VTOFDESC(ap->a_vp)->fd_type != Froot) panic("fdesc_readdir: not dir"); off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || uio->uio_resid < UIO_MX) return (EINVAL); i = (u_int)off / UIO_MX; fdp = uio->uio_td->td_proc->p_fd; error = 0; fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { switch (i) { case 0: /* `.' */ case 1: /* `..' */ bzero((caddr_t)dp, UIO_MX); dp->d_fileno = i + FD_ROOT; dp->d_namlen = i + 1; dp->d_reclen = UIO_MX; bcopy("..", dp->d_name, dp->d_namlen); dp->d_name[i + 1] = '\0'; dp->d_type = DT_DIR; break; default: if (fdp->fd_ofiles[fcnt] == NULL) { FILEDESC_SUNLOCK(fdp); goto done; } bzero((caddr_t) dp, UIO_MX); dp->d_namlen = sprintf(dp->d_name, "%d", fcnt); dp->d_reclen = UIO_MX; dp->d_type = DT_UNKNOWN; dp->d_fileno = i + FD_DESC; break; } /* * And ship to userland */ FILEDESC_SUNLOCK(fdp); error = uiomove(dp, UIO_MX, uio); if (error) goto done; FILEDESC_SLOCK(fdp); i++; fcnt++; } FILEDESC_SUNLOCK(fdp); done: uio->uio_offset = i * UIO_MX; return (error); } static int fdesc_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; /* * Clear out the v_type field to avoid * nasty things happening in vgone(). */ vp->v_type = VNON; return (0); } static int fdesc_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct fdescnode *fd = VTOFDESC(vp); LIST_REMOVE(fd, fd_hash); FREE(vp->v_data, M_TEMP); vp->v_data = 0; return (0); } Index: head/sys/fs/fifofs/fifo_vnops.c =================================================================== --- head/sys/fs/fifofs/fifo_vnops.c (revision 175201) +++ head/sys/fs/fifofs/fifo_vnops.c (revision 175202) @@ -1,748 +1,748 @@ /*- * Copyright (c) 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fifo_vnops.c 8.10 (Berkeley) 5/27/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include /* XXXKSE */ #include #include #include #include #include #include #include #include #include static fo_rdwr_t fifo_read_f; static fo_rdwr_t fifo_write_f; static fo_ioctl_t fifo_ioctl_f; static fo_poll_t fifo_poll_f; static fo_kqfilter_t fifo_kqfilter_f; static fo_stat_t fifo_stat_f; static fo_close_t fifo_close_f; static fo_truncate_t fifo_truncate_f; struct fileops fifo_ops_f = { .fo_read = fifo_read_f, .fo_write = fifo_write_f, .fo_truncate = fifo_truncate_f, .fo_ioctl = fifo_ioctl_f, .fo_poll = fifo_poll_f, .fo_kqfilter = fifo_kqfilter_f, .fo_stat = fifo_stat_f, .fo_close = fifo_close_f, .fo_flags = DFLAG_PASSABLE }; /* * This structure is associated with the FIFO vnode and stores * the state associated with the FIFO. */ struct fifoinfo { struct socket *fi_readsock; struct socket *fi_writesock; long fi_readers; long fi_writers; }; static vop_print_t fifo_print; static vop_open_t fifo_open; static vop_close_t fifo_close; static vop_ioctl_t fifo_ioctl; static vop_kqfilter_t fifo_kqfilter; static vop_pathconf_t fifo_pathconf; static vop_advlock_t fifo_advlock; static void filt_fifordetach(struct knote *kn); static int filt_fiforead(struct knote *kn, long hint); static void filt_fifowdetach(struct knote *kn); static int filt_fifowrite(struct knote *kn, long hint); static void filt_fifodetach_notsup(struct knote *kn); static int filt_fifo_notsup(struct knote *kn, long hint); static struct filterops fiforead_filtops = { 1, NULL, filt_fifordetach, filt_fiforead }; static struct filterops fifowrite_filtops = { 1, NULL, filt_fifowdetach, filt_fifowrite }; static struct filterops fifo_notsup_filtops = { 1, NULL, filt_fifodetach_notsup, filt_fifo_notsup }; struct vop_vector fifo_specops = { .vop_default = &default_vnodeops, .vop_access = VOP_EBADF, .vop_advlock = fifo_advlock, .vop_close = fifo_close, .vop_create = VOP_PANIC, .vop_getattr = VOP_EBADF, .vop_ioctl = fifo_ioctl, .vop_kqfilter = fifo_kqfilter, .vop_lease = VOP_NULL, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = fifo_open, .vop_pathconf = fifo_pathconf, .vop_print = fifo_print, .vop_read = VOP_PANIC, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = VOP_NULL, .vop_remove = VOP_PANIC, .vop_rename = VOP_PANIC, .vop_rmdir = VOP_PANIC, .vop_setattr = VOP_EBADF, .vop_symlink = VOP_PANIC, .vop_write = VOP_PANIC, }; struct mtx fifo_mtx; MTX_SYSINIT(fifo, &fifo_mtx, "fifo mutex", MTX_DEF); /* * Dispose of fifo resources. */ static void fifo_cleanup(struct vnode *vp) { struct fifoinfo *fip = vp->v_fifoinfo; ASSERT_VOP_LOCKED(vp, "fifo_cleanup"); if (fip->fi_readers == 0 && fip->fi_writers == 0) { vp->v_fifoinfo = NULL; (void)soclose(fip->fi_readsock); (void)soclose(fip->fi_writesock); FREE(fip, M_VNODE); } } /* * Open called to set up a new instance of a fifo or * to find an active instance of a fifo. */ /* ARGSUSED */ static int fifo_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; struct file *fp = ap->a_fp; struct socket *rso, *wso; int error; ASSERT_VOP_ELOCKED(vp, "fifo_open"); if (fp == NULL) return (EINVAL); if ((fip = vp->v_fifoinfo) == NULL) { MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK); error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, cred, td); if (error) goto fail1; fip->fi_readsock = rso; error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, cred, td); if (error) goto fail2; fip->fi_writesock = wso; error = soconnect2(wso, rso); if (error) { (void)soclose(wso); fail2: (void)soclose(rso); fail1: free(fip, M_VNODE); return (error); } fip->fi_readers = fip->fi_writers = 0; wso->so_snd.sb_lowat = PIPE_BUF; SOCKBUF_LOCK(&rso->so_rcv); rso->so_rcv.sb_state |= SBS_CANTRCVMORE; SOCKBUF_UNLOCK(&rso->so_rcv); KASSERT(vp->v_fifoinfo == NULL, ("fifo_open: v_fifoinfo race")); vp->v_fifoinfo = fip; } /* * General access to fi_readers and fi_writers is protected using * the vnode lock. * * Protect the increment of fi_readers and fi_writers and the * associated calls to wakeup() with the fifo mutex in addition * to the vnode lock. This allows the vnode lock to be dropped * for the msleep() calls below, and using the fifo mutex with * msleep() prevents the wakeup from being missed. */ mtx_lock(&fifo_mtx); if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { SOCKBUF_LOCK(&fip->fi_writesock->so_snd); fip->fi_writesock->so_snd.sb_state &= ~SBS_CANTSENDMORE; SOCKBUF_UNLOCK(&fip->fi_writesock->so_snd); if (fip->fi_writers > 0) { wakeup(&fip->fi_writers); sowwakeup(fip->fi_writesock); } } } if (ap->a_mode & FWRITE) { if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) { mtx_unlock(&fifo_mtx); return (ENXIO); } fip->fi_writers++; if (fip->fi_writers == 1) { SOCKBUF_LOCK(&fip->fi_readsock->so_rcv); fip->fi_readsock->so_rcv.sb_state &= ~SBS_CANTRCVMORE; SOCKBUF_UNLOCK(&fip->fi_readsock->so_rcv); if (fip->fi_readers > 0) { wakeup(&fip->fi_readers); sorwakeup(fip->fi_readsock); } } } if ((ap->a_mode & O_NONBLOCK) == 0) { if ((ap->a_mode & FREAD) && fip->fi_writers == 0) { VOP_UNLOCK(vp, 0, td); error = msleep(&fip->fi_readers, &fifo_mtx, PDROP | PCATCH | PSOCK, "fifoor", 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { fip->fi_readers--; if (fip->fi_readers == 0) { socantsendmore(fip->fi_writesock); fifo_cleanup(vp); } return (error); } mtx_lock(&fifo_mtx); /* * We must have got woken up because we had a writer. * That (and not still having one) is the condition * that we must wait for. */ } if ((ap->a_mode & FWRITE) && fip->fi_readers == 0) { VOP_UNLOCK(vp, 0, td); error = msleep(&fip->fi_writers, &fifo_mtx, PDROP | PCATCH | PSOCK, "fifoow", 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { fip->fi_writers--; if (fip->fi_writers == 0) { socantrcvmore(fip->fi_readsock); fifo_cleanup(vp); } return (error); } /* * We must have got woken up because we had * a reader. That (and not still having one) * is the condition that we must wait for. */ mtx_lock(&fifo_mtx); } } mtx_unlock(&fifo_mtx); KASSERT(fp != NULL, ("can't fifo/vnode bypass")); KASSERT(fp->f_ops == &badfileops, ("not badfileops in fifo_open")); finit(fp, fp->f_flag, DTYPE_FIFO, fip, &fifo_ops_f); return (0); } /* * Now unused vnode ioctl routine. */ /* ARGSUSED */ static int fifo_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { printf("WARNING: fifo_ioctl called unexpectedly\n"); return (ENOTTY); } /* * Now unused vnode kqfilter routine. */ /* ARGSUSED */ static int fifo_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { printf("WARNING: fifo_kqfilter called unexpectedly\n"); return (EINVAL); } static void filt_fifordetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK(&so->so_rcv); knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_rcv); } static int filt_fiforead(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } } static void filt_fifowdetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK(&so->so_snd); knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_snd); } static int filt_fifowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= so->so_snd.sb_lowat); } } static void filt_fifodetach_notsup(struct knote *kn) { } static int filt_fifo_notsup(struct knote *kn, long hint) { return (0); } /* * Device close routine */ /* ARGSUSED */ static int fifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip = vp->v_fifoinfo; ASSERT_VOP_LOCKED(vp, "fifo_close"); KASSERT(fip != NULL, ("fifo_close: no v_fifoinfo")); if (ap->a_fflag & FREAD) { fip->fi_readers--; if (fip->fi_readers == 0) socantsendmore(fip->fi_writesock); } if (ap->a_fflag & FWRITE) { fip->fi_writers--; if (fip->fi_writers == 0) socantrcvmore(fip->fi_readsock); } fifo_cleanup(vp); return (0); } /* * Print out internal contents of a fifo vnode. */ int fifo_printinfo(vp) struct vnode *vp; { register struct fifoinfo *fip = vp->v_fifoinfo; if (fip == NULL){ printf(", NULL v_fifoinfo"); return (0); } printf(", fifo with %ld readers and %ld writers", fip->fi_readers, fip->fi_writers); return (0); } /* * Print out the contents of a fifo vnode. */ static int fifo_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { fifo_printinfo(ap->a_vp); printf("\n"); return (0); } /* * Return POSIX pathconf information applicable to fifo's. */ static int fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Fifo advisory byte-level locks. */ /* ARGSUSED */ static int fifo_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } static int fifo_close_f(struct file *fp, struct thread *td) { return (vnops.fo_close(fp, td)); } /* * The implementation of ioctl() for named fifos is complicated by the fact * that we permit O_RDWR fifo file descriptors, meaning that the actions of * ioctls may have to be applied to both the underlying sockets rather than * just one. The original implementation simply forward the ioctl to one * or both sockets based on fp->f_flag. We now consider each ioctl * separately, as the composition effect requires careful ordering. * * We do not blindly pass all ioctls through to the socket in order to avoid * providing unnecessary ioctls that might be improperly depended on by * applications (such as socket-specific, routing, and interface ioctls). * * Unlike sys_pipe.c, fifos do not implement the deprecated TIOCSPGRP and * TIOCGPGRP ioctls. Earlier implementations of fifos did forward SIOCSPGRP * and SIOCGPGRP ioctls, so we might need to re-add those here. */ static int fifo_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct fifoinfo *fi; struct file filetmp; /* Local, so need not be locked. */ int error; error = ENOTTY; fi = fp->f_data; switch (com) { case FIONBIO: /* * Non-blocking I/O is implemented at the fifo layer using * MSG_NBIO, so does not need to be forwarded down the stack. */ return (0); case FIOASYNC: case FIOSETOWN: case FIOGETOWN: /* * These socket ioctls don't have any ordering requirements, * so are called in an arbitrary order, and only on the * sockets indicated by the file descriptor rights. * * XXXRW: If O_RDWR and the read socket accepts an ioctl but * the write socket doesn't, the socketpair is left in an * inconsistent state. */ if (fp->f_flag & FREAD) { filetmp.f_data = fi->fi_readsock; filetmp.f_cred = cred; error = soo_ioctl(&filetmp, com, data, cred, td); if (error) return (error); } if (fp->f_flag & FWRITE) { filetmp.f_data = fi->fi_writesock; filetmp.f_cred = cred; error = soo_ioctl(&filetmp, com, data, cred, td); } return (error); case FIONREAD: /* * FIONREAD will return 0 for non-readable descriptors, and * the results of FIONREAD on the read socket for readable * descriptors. */ if (!(fp->f_flag & FREAD)) { *(int *)data = 0; return (0); } filetmp.f_data = fi->fi_readsock; filetmp.f_cred = cred; return (soo_ioctl(&filetmp, com, data, cred, td)); default: return (ENOTTY); } } /* * Because fifos are now a file descriptor layer object, EVFILT_VNODE is not * implemented. Likely, fifo_kqfilter() should be removed, and * fifo_kqfilter_f() should know how to forward the request to the underling * vnode using f_vnode in the file descriptor here. */ static int fifo_kqfilter_f(struct file *fp, struct knote *kn) { struct fifoinfo *fi; struct socket *so; struct sockbuf *sb; fi = fp->f_data; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &fifo_notsup_filtops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &fifo_notsup_filtops; return (0); } switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &fiforead_filtops; so = fi->fi_readsock; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &fifowrite_filtops; so = fi->fi_writesock; sb = &so->so_snd; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)so; SOCKBUF_LOCK(sb); knlist_add(&sb->sb_sel.si_note, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); return (0); } static int fifo_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct fifoinfo *fip; struct file filetmp; int levents, revents = 0; fip = fp->f_data; levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if ((fp->f_flag & FREAD) && levents) { /* * If POLLIN or POLLRDNORM is requested and POLLINIGNEOF is * not, then convert the first two to the last one. This * tells the socket poll function to ignore EOF so that we * block if there is no writer (and no data). Callers can * set POLLINIGNEOF to get non-blocking behavior. */ if (levents & (POLLIN | POLLRDNORM) && !(levents & POLLINIGNEOF)) { levents &= ~(POLLIN | POLLRDNORM); levents |= POLLINIGNEOF; } filetmp.f_data = fip->fi_readsock; filetmp.f_cred = cred; revents |= soo_poll(&filetmp, levents, cred, td); /* Reverse the above conversion. */ if ((revents & POLLINIGNEOF) && !(events & POLLINIGNEOF)) { revents |= (events & (POLLIN | POLLRDNORM)); revents &= ~POLLINIGNEOF; } } levents = events & (POLLOUT | POLLWRNORM | POLLWRBAND); if ((fp->f_flag & FWRITE) && levents) { filetmp.f_data = fip->fi_writesock; filetmp.f_cred = cred; revents |= soo_poll(&filetmp, levents, cred, td); } return (revents); } static int fifo_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct fifoinfo *fip; int error, sflags; fip = fp->f_data; KASSERT(uio->uio_rw == UIO_READ,("fifo_read mode")); if (uio->uio_resid == 0) return (0); sflags = (fp->f_flag & FNONBLOCK) ? MSG_NBIO : 0; mtx_lock(&Giant); error = soreceive(fip->fi_readsock, NULL, uio, NULL, NULL, &sflags); mtx_unlock(&Giant); return (error); } static int fifo_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int fifo_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int fifo_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct fifoinfo *fip; int error, sflags; fip = fp->f_data; KASSERT(uio->uio_rw == UIO_WRITE,("fifo_write mode")); sflags = (fp->f_flag & FNONBLOCK) ? MSG_NBIO : 0; mtx_lock(&Giant); error = sosend(fip->fi_writesock, NULL, uio, 0, NULL, sflags, td); mtx_unlock(&Giant); return (error); } Index: head/sys/fs/hpfs/hpfs_vnops.c =================================================================== --- head/sys/fs/hpfs/hpfs_vnops.c (revision 175201) +++ head/sys/fs/hpfs/hpfs_vnops.c (revision 175202) @@ -1,1258 +1,1258 @@ /*- * Copyright (c) 1998, 1999 Semen Ustimenko (semenu@FreeBSD.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for pathconf(2) constants */ #include #include #include #include static int hpfs_de_uiomove(struct hpfsmount *, struct hpfsdirent *, struct uio *); static vop_ioctl_t hpfs_ioctl; static vop_read_t hpfs_read; static vop_write_t hpfs_write; static vop_getattr_t hpfs_getattr; static vop_setattr_t hpfs_setattr; static vop_inactive_t hpfs_inactive; static vop_print_t hpfs_print; static vop_reclaim_t hpfs_reclaim; static vop_strategy_t hpfs_strategy; static vop_access_t hpfs_access; static vop_open_t hpfs_open; static vop_close_t hpfs_close; static vop_readdir_t hpfs_readdir; static vop_cachedlookup_t hpfs_lookup; static vop_create_t hpfs_create; static vop_remove_t hpfs_remove; static vop_bmap_t hpfs_bmap; static vop_fsync_t hpfs_fsync; static vop_pathconf_t hpfs_pathconf; static vop_vptofh_t hpfs_vptofh; static int hpfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { /* * Flush our dirty buffers. */ vop_stdfsync(ap); /* * Write out the on-disc version of the vnode. */ return hpfs_update(VTOHP(ap->a_vp)); } static int hpfs_ioctl ( struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap) { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); int error; printf("hpfs_ioctl(0x%x, 0x%lx, 0x%p, 0x%x): ", hp->h_no, ap->a_command, ap->a_data, ap->a_fflag); switch (ap->a_command) { case HPFSIOCGEANUM: { u_long eanum; u_long passed; struct ea *eap; eanum = 0; if (hp->h_fn.fn_ealen > 0) { eap = (struct ea *)&(hp->h_fn.fn_int); passed = 0; while (passed < hp->h_fn.fn_ealen) { printf("EAname: %s\n", EA_NAME(eap)); eanum++; passed += sizeof(struct ea) + eap->ea_namelen + 1 + eap->ea_vallen; eap = (struct ea *)((caddr_t)hp->h_fn.fn_int + passed); } error = 0; } else { error = ENOENT; } printf("%lu eas\n", eanum); *(u_long *)ap->a_data = eanum; break; } case HPFSIOCGEASZ: { u_long eanum; u_long passed; struct ea *eap; printf("EA%ld\n", *(u_long *)ap->a_data); eanum = 0; if (hp->h_fn.fn_ealen > 0) { eap = (struct ea *)&(hp->h_fn.fn_int); passed = 0; error = ENOENT; while (passed < hp->h_fn.fn_ealen) { printf("EAname: %s\n", EA_NAME(eap)); if (eanum == *(u_long *)ap->a_data) { *(u_long *)ap->a_data = eap->ea_namelen + 1 + eap->ea_vallen; error = 0; break; } eanum++; passed += sizeof(struct ea) + eap->ea_namelen + 1 + eap->ea_vallen; eap = (struct ea *)((caddr_t)hp->h_fn.fn_int + passed); } } else { error = ENOENT; } break; } case HPFSIOCRDEA: { u_long eanum; u_long passed; struct hpfs_rdea *rdeap; struct ea *eap; rdeap = (struct hpfs_rdea *)ap->a_data; printf("EA%ld\n", rdeap->ea_no); eanum = 0; if (hp->h_fn.fn_ealen > 0) { eap = (struct ea *)&(hp->h_fn.fn_int); passed = 0; error = ENOENT; while (passed < hp->h_fn.fn_ealen) { printf("EAname: %s\n", EA_NAME(eap)); if (eanum == rdeap->ea_no) { rdeap->ea_sz = eap->ea_namelen + 1 + eap->ea_vallen; copyout(EA_NAME(eap),rdeap->ea_data, rdeap->ea_sz); error = 0; break; } eanum++; passed += sizeof(struct ea) + eap->ea_namelen + 1 + eap->ea_vallen; eap = (struct ea *)((caddr_t)hp->h_fn.fn_int + passed); } } else { error = ENOENT; } break; } default: error = ENOTTY; break; } return (error); } /* * Map file offset to disk offset. */ int hpfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { register struct hpfsnode *hp = VTOHP(ap->a_vp); daddr_t blkno; int error; if (ap->a_bop != NULL) *ap->a_bop = &hp->h_devvp->v_bufobj; if (ap->a_runb != NULL) *ap->a_runb = 0; if (ap->a_bnp == NULL) return (0); dprintf(("hpfs_bmap(0x%x, 0x%x): ",hp->h_no, ap->a_bn)); error = hpfs_hpbmap (hp, ap->a_bn, &blkno, ap->a_runp); *ap->a_bnp = blkno; return (error); } static int hpfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); struct uio *uio = ap->a_uio; struct buf *bp; u_int xfersz, toread; u_int off; daddr_t lbn, bn; int resid; int runl; int error = 0; resid = min (uio->uio_resid, hp->h_fn.fn_size - uio->uio_offset); dprintf(("hpfs_read(0x%x, off: %d resid: %d, segflg: %d): [resid: 0x%x]\n",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg, resid)); while (resid) { lbn = uio->uio_offset >> DEV_BSHIFT; off = uio->uio_offset & (DEV_BSIZE - 1); dprintf(("hpfs_read: resid: 0x%x lbn: 0x%x off: 0x%x\n", uio->uio_resid, lbn, off)); error = hpfs_hpbmap(hp, lbn, &bn, &runl); if (error) return (error); toread = min(off + resid, min(DFLTPHYS, (runl+1)*DEV_BSIZE)); xfersz = (toread + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); dprintf(("hpfs_read: bn: 0x%x (0x%x) toread: 0x%x (0x%x)\n", bn, runl, toread, xfersz)); if (toread == 0) break; error = bread(hp->h_devvp, bn, xfersz, NOCRED, &bp); if (error) { brelse(bp); break; } error = uiomove(bp->b_data + off, toread - off, uio); if(error) { brelse(bp); break; } brelse(bp); resid -= toread; } dprintf(("hpfs_read: successful\n")); return (error); } static int hpfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); struct uio *uio = ap->a_uio; struct buf *bp; u_int xfersz, towrite; u_int off; daddr_t lbn, bn; int runl; int error = 0; dprintf(("hpfs_write(0x%x, off: %d resid: %d, segflg: %d):\n",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); if (ap->a_ioflag & IO_APPEND) { dprintf(("hpfs_write: APPEND mode\n")); uio->uio_offset = hp->h_fn.fn_size; } if (uio->uio_offset + uio->uio_resid > hp->h_fn.fn_size) { error = hpfs_extend (hp, uio->uio_offset + uio->uio_resid); if (error) { printf("hpfs_write: hpfs_extend FAILED %d\n", error); return (error); } } while (uio->uio_resid) { lbn = uio->uio_offset >> DEV_BSHIFT; off = uio->uio_offset & (DEV_BSIZE - 1); dprintf(("hpfs_write: resid: 0x%x lbn: 0x%x off: 0x%x\n", uio->uio_resid, lbn, off)); error = hpfs_hpbmap(hp, lbn, &bn, &runl); if (error) return (error); towrite = min(off + uio->uio_resid, min(DFLTPHYS, (runl+1)*DEV_BSIZE)); xfersz = (towrite + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); dprintf(("hpfs_write: bn: 0x%x (0x%x) towrite: 0x%x (0x%x)\n", bn, runl, towrite, xfersz)); if ((off == 0) && (towrite == xfersz)) { bp = getblk(hp->h_devvp, bn, xfersz, 0, 0, 0); clrbuf(bp); } else { error = bread(hp->h_devvp, bn, xfersz, NOCRED, &bp); if (error) { brelse(bp); return (error); } } error = uiomove(bp->b_data + off, towrite - off, uio); if(error) { brelse(bp); return (error); } if (ap->a_ioflag & IO_SYNC) bwrite(bp); else bawrite(bp); } dprintf(("hpfs_write: successful\n")); return (0); } /* * XXXXX do we need hpfsnode locking inside? */ static int hpfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); register struct vattr *vap = ap->a_vap; int error; dprintf(("hpfs_getattr(0x%x):\n", hp->h_no)); vap->va_fsid = dev2udev(hp->h_dev); vap->va_fileid = hp->h_no; vap->va_mode = hp->h_mode; vap->va_nlink = 1; vap->va_uid = hp->h_uid; vap->va_gid = hp->h_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = hp->h_fn.fn_size; vap->va_bytes = ((hp->h_fn.fn_size + DEV_BSIZE-1) & ~(DEV_BSIZE-1)) + DEV_BSIZE; if (!(hp->h_flag & H_PARVALID)) { error = hpfs_validateparent(hp); if (error) return (error); } vap->va_atime = hpfstimetounix(hp->h_atime); vap->va_mtime = hpfstimetounix(hp->h_mtime); vap->va_ctime = hpfstimetounix(hp->h_ctime); vap->va_flags = 0; vap->va_gen = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * XXXXX do we need hpfsnode locking inside? */ static int hpfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct hpfsnode *hp = VTOHP(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; int error; dprintf(("hpfs_setattr(0x%x):\n", hp->h_no)); /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { dprintf(("hpfs_setattr: changing nonsettable attr\n")); return (EINVAL); } /* Can't change flags XXX Could be implemented */ if (vap->va_flags != VNOVAL) { printf("hpfs_setattr: FLAGS CANNOT BE SET\n"); return (EINVAL); } /* Can't change uid/gid XXX Could be implemented */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { printf("hpfs_setattr: UID/GID CANNOT BE SET\n"); return (EINVAL); } /* Can't change mode XXX Could be implemented */ if (vap->va_mode != (mode_t)VNOVAL) { printf("hpfs_setattr: MODE CANNOT BE SET\n"); return (EINVAL); } /* Update times */ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (vap->va_vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, cred, td); if (error) error = VOP_ACCESS(vp, VWRITE, cred, td); } else error = VOP_ACCESS(vp, VADMIN, cred, td); if (vap->va_atime.tv_sec != VNOVAL) hp->h_atime = vap->va_atime.tv_sec; if (vap->va_mtime.tv_sec != VNOVAL) hp->h_mtime = vap->va_mtime.tv_sec; hp->h_flag |= H_PARCHANGE; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: printf("hpfs_setattr: WRONG v_type\n"); return (EINVAL); } if (vap->va_size < hp->h_fn.fn_size) { error = vtruncbuf(vp, cred, td, vap->va_size, DEV_BSIZE); if (error) return (error); error = hpfs_truncate(hp, vap->va_size); if (error) return (error); } else if (vap->va_size > hp->h_fn.fn_size) { vnode_pager_setsize(vp, vap->va_size); error = hpfs_extend(hp, vap->va_size); if (error) return (error); } } return (0); } /* * Last reference to a node. If necessary, write or delete it. */ int hpfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); int error; dprintf(("hpfs_inactive(0x%x): \n", hp->h_no)); if (hp->h_flag & H_CHANGE) { dprintf(("hpfs_inactive: node changed, update\n")); error = hpfs_update (hp); if (error) return (error); } if (hp->h_flag & H_PARCHANGE) { dprintf(("hpfs_inactive: parent node changed, update\n")); error = hpfs_updateparent (hp); if (error) return (error); } if (prtactive && vrefcnt(vp) != 0) vprint("hpfs_inactive: pushing active", vp); if (hp->h_flag & H_INVAL) { vrecycle(vp, ap->a_td); return (0); } return (0); } /* * Reclaim an inode so that it can be used for other purposes. */ int hpfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); dprintf(("hpfs_reclaim(0x%x0): \n", hp->h_no)); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); vfs_hash_remove(vp); mtx_destroy(&hp->h_interlock); vp->v_data = NULL; FREE(hp, M_HPFSNO); return (0); } static int hpfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); printf("\tino 0x%x\n", hp->h_no); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the hpfs_hpbmap operation may not * deadlock on memory. See hpfs_bmap() for details. XXXXXXX (not impl) */ int hpfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(ap->a_vp); daddr_t blkno; struct bufobj *bo; int error; dprintf(("hpfs_strategy(): \n")); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("hpfs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = hpfs_hpbmap (hp, bp->b_lblkno, &blkno, NULL); bp->b_blkno = blkno; if (error) { printf("hpfs_strategy: hpfs_bpbmap FAILED %d\n", error); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = hp->h_hpmp->hpm_bo; BO_STRATEGY(bo, bp); return (0); } /* * XXXXX do we need hpfsnode locking inside? */ int hpfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct hpfsnode *hp = VTOHP(vp); mode_t mode = ap->a_mode; dprintf(("hpfs_access(0x%x):\n", hp->h_no)); /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; } } return (vaccess(vp->v_type, hp->h_mode, hp->h_uid, hp->h_gid, ap->a_mode, ap->a_cred, NULL)); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ static int hpfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { #ifdef HPFS_DEBUG register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); printf("hpfs_open(0x%x):\n",hp->h_no); #endif /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int hpfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { #ifdef HPFS_DEBUG register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); printf("hpfs_close: %d\n",hp->h_no); #endif return (0); } static int hpfs_de_uiomove ( struct hpfsmount *hpmp, struct hpfsdirent *dep, struct uio *uio) { struct dirent cde; int i, error; dprintf(("[no: 0x%x, size: %d, name: %2d:%.*s, flag: 0x%x] ", dep->de_fnode, dep->de_size, dep->de_namelen, dep->de_namelen, dep->de_name, dep->de_flag)); /*strncpy(cde.d_name, dep->de_name, dep->de_namelen);*/ for (i=0; ide_namelen; i++) cde.d_name[i] = hpfs_d2u(hpmp, dep->de_name[i]); cde.d_name[dep->de_namelen] = '\0'; cde.d_namlen = dep->de_namelen; cde.d_fileno = dep->de_fnode; cde.d_type = (dep->de_flag & DE_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if (error) return (error); dprintf(("[0x%x] ", uio->uio_resid)); return (error); } static struct dirent hpfs_de_dot = { 0, sizeof(struct dirent), DT_DIR, 1, "." }; static struct dirent hpfs_de_dotdot = { 0, sizeof(struct dirent), DT_DIR, 2, ".." }; int hpfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); struct hpfsmount *hpmp = hp->h_hpmp; struct uio *uio = ap->a_uio; int ncookies = 0, i, num, cnum; int error = 0; off_t off; struct buf *bp; struct dirblk *dp; struct hpfsdirent *dep; lsn_t olsn; lsn_t lsn; int level; dprintf(("hpfs_readdir(0x%x, 0x%x, 0x%x): ",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; if( uio->uio_offset < sizeof(struct dirent) ) { dprintf((". faked, ")); hpfs_de_dot.d_fileno = hp->h_no; error = uiomove((char *)&hpfs_de_dot,sizeof(struct dirent),uio); if(error) { return (error); } ncookies ++; } if( uio->uio_offset < 2 * sizeof(struct dirent) ) { dprintf((".. faked, ")); hpfs_de_dotdot.d_fileno = hp->h_fn.fn_parent; error = uiomove((char *)&hpfs_de_dotdot, sizeof(struct dirent), uio); if(error) { return (error); } ncookies ++; } num = uio->uio_offset / sizeof(struct dirent) - 2; cnum = 0; lsn = ((alleaf_t *)hp->h_fn.fn_abd)->al_lsn; olsn = 0; level = 1; dive: dprintf(("[dive 0x%x] ", lsn)); error = bread(hp->h_devvp, lsn, D_BSIZE, NOCRED, &bp); if (error) { brelse(bp); return (error); } dp = (struct dirblk *) bp->b_data; if (dp->d_magic != D_MAGIC) { printf("hpfs_readdir: MAGIC DOESN'T MATCH\n"); brelse(bp); return (EINVAL); } dep = D_DIRENT(dp); if (olsn) { dprintf(("[restore 0x%x] ", olsn)); while(!(dep->de_flag & DE_END) ) { if((dep->de_flag & DE_DOWN) && (olsn == DE_DOWNLSN(dep))) break; dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen); } if((dep->de_flag & DE_DOWN) && (olsn == DE_DOWNLSN(dep))) { if (dep->de_flag & DE_END) goto blockdone; if (!(dep->de_flag & DE_SPECIAL)) { if (num <= cnum) { if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } error = hpfs_de_uiomove(hpmp, dep, uio); if (error) { brelse (bp); return (error); } ncookies++; if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } } cnum++; } dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen); } else { printf("hpfs_readdir: ERROR! oLSN not found\n"); brelse(bp); return (EINVAL); } } olsn = 0; while(!(dep->de_flag & DE_END)) { if(dep->de_flag & DE_DOWN) { lsn = DE_DOWNLSN(dep); brelse(bp); level++; goto dive; } if (!(dep->de_flag & DE_SPECIAL)) { if (num <= cnum) { if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } error = hpfs_de_uiomove(hpmp, dep, uio); if (error) { brelse (bp); return (error); } ncookies++; if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } } cnum++; } dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen); } if(dep->de_flag & DE_DOWN) { dprintf(("[enddive] ")); lsn = DE_DOWNLSN(dep); brelse(bp); level++; goto dive; } blockdone: dprintf(("[EOB] ")); olsn = lsn; lsn = dp->d_parent; brelse(bp); level--; dprintf(("[level %d] ", level)); if (level > 0) goto dive; /* undive really */ if (ap->a_eofflag) { dprintf(("[EOF] ")); *ap->a_eofflag = 1; } readdone: dprintf(("[readdone]\n")); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; u_long *cookies; u_long *cookiep; dprintf(("%d cookies, ",ncookies)); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("hpfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } return (0); } int hpfs_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct hpfsnode *dhp = VTOHP(dvp); struct hpfsmount *hpmp = dhp->h_hpmp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; dprintf(("hpfs_lookup(0x%x, %s, %ld):\n", dhp->h_no, cnp->cn_nameptr, cnp->cn_namelen)); if (nameiop != CREATE && nameiop != DELETE && nameiop != LOOKUP) { printf("hpfs_lookup: LOOKUP, DELETE and CREATE are only supported\n"); return (EOPNOTSUPP); } error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_thread); if(error) return (error); if( (cnp->cn_namelen == 1) && !strncmp(cnp->cn_nameptr,".",1) ) { dprintf(("hpfs_lookup(0x%x,...): . faked\n",dhp->h_no)); VREF(dvp); *ap->a_vpp = dvp; return (0); } else if( (cnp->cn_namelen == 2) && !strncmp(cnp->cn_nameptr,"..",2) && (flags & ISDOTDOT) ) { dprintf(("hpfs_lookup(0x%x,...): .. faked (0x%x)\n", dhp->h_no, dhp->h_fn.fn_parent)); if (VFS_VGET(hpmp->hpm_mp, dhp->h_fn.fn_parent, LK_NOWAIT | LK_EXCLUSIVE, ap->a_vpp)) { VOP_UNLOCK(dvp,0,cnp->cn_thread); error = VFS_VGET(hpmp->hpm_mp, dhp->h_fn.fn_parent, LK_EXCLUSIVE, ap->a_vpp); - vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY, cnp->cn_thread); + vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY); if (error) return(error); } return (0); } else { struct buf *bp; struct hpfsdirent *dep; struct hpfsnode *hp; error = hpfs_genlookupbyname(dhp, cnp->cn_nameptr, cnp->cn_namelen, &bp, &dep); if (error) { if ((error == ENOENT) && (flags & ISLASTCN) && (nameiop == CREATE || nameiop == RENAME)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (error); } dprintf(("hpfs_lookup: fnode: 0x%x, CPID: 0x%x\n", dep->de_fnode, dep->de_cpid)); if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_thread); if (error) { brelse(bp); return (error); } } if (dhp->h_no == dep->de_fnode) { brelse(bp); VREF(dvp); *ap->a_vpp = dvp; return (0); } error = VFS_VGET(hpmp->hpm_mp, dep->de_fnode, LK_EXCLUSIVE, ap->a_vpp); if (error) { printf("hpfs_lookup: VFS_VGET FAILED %d\n", error); brelse(bp); return(error); } hp = VTOHP(*ap->a_vpp); hp->h_mtime = dep->de_mtime; hp->h_ctime = dep->de_ctime; hp->h_atime = dep->de_atime; bcopy(dep->de_name, hp->h_name, dep->de_namelen); hp->h_name[dep->de_namelen] = '\0'; hp->h_namelen = dep->de_namelen; hp->h_flag |= H_PARVALID; brelse(bp); if ((flags & MAKEENTRY) && (!(flags & ISLASTCN) || (nameiop != DELETE && nameiop != CREATE))) cache_enter(dvp, *ap->a_vpp, cnp); } return (error); } int hpfs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { int error; dprintf(("hpfs_remove(0x%x, %s, %ld): \n", VTOHP(ap->a_vp)->h_no, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); if (ap->a_vp->v_type == VDIR) return (EPERM); error = hpfs_removefnode (ap->a_dvp, ap->a_vp, ap->a_cnp); return (error); } int hpfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; dprintf(("hpfs_create(0x%x, %s, %ld): \n", VTOHP(ap->a_dvp)->h_no, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); if (!(ap->a_cnp->cn_flags & HASBUF)) panic ("hpfs_create: no name\n"); error = hpfs_makefnode (ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap); return (error); } /* * Return POSIX pathconf information applicable to NTFS filesystem */ int hpfs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = HPFS_MAXFILENAME; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } int hpfs_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; struct fid *a_fhp; } */ *ap; { register struct hpfsnode *hpp; register struct hpfid *hpfhp; hpp = VTOHP(ap->a_vp); hpfhp = (struct hpfid *)ap->a_fhp; hpfhp->hpfid_len = sizeof(struct hpfid); hpfhp->hpfid_ino = hpp->h_no; /* hpfhp->hpfid_gen = hpp->h_gen; */ return (0); } /* * Global vfs data structures */ struct vop_vector hpfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = hpfs_access, .vop_bmap = hpfs_bmap, .vop_cachedlookup = hpfs_lookup, .vop_close = hpfs_close, .vop_create = hpfs_create, .vop_fsync = hpfs_fsync, .vop_getattr = hpfs_getattr, .vop_inactive = hpfs_inactive, .vop_ioctl = hpfs_ioctl, .vop_lookup = vfs_cache_lookup, .vop_open = hpfs_open, .vop_pathconf = hpfs_pathconf, .vop_print = hpfs_print, .vop_read = hpfs_read, .vop_readdir = hpfs_readdir, .vop_reclaim = hpfs_reclaim, .vop_remove = hpfs_remove, .vop_setattr = hpfs_setattr, .vop_strategy = hpfs_strategy, .vop_write = hpfs_write, .vop_vptofh = hpfs_vptofh, }; Index: head/sys/fs/msdosfs/msdosfs_lookup.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_lookup.c (revision 175201) +++ head/sys/fs/msdosfs/msdosfs_lookup.c (revision 175202) @@ -1,1088 +1,1088 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include /* * When we search a directory the blocks containing directory entries are * read and examined. The directory entries contain information that would * normally be in the inode of a unix filesystem. This means that some of * a directory's contents may also be in memory resident denodes (sort of * an inode). This can cause problems if we are searching while some other * process is modifying a directory. To prevent one process from accessing * incompletely modified directory information we depend upon being the * sole owner of a directory block. bread/brelse provide this service. * This being the case, when a process modifies a directory it must first * acquire the disk block that contains the directory entry to be modified. * Then update the disk block and the denode, and then write the disk block * out to disk. This way disk blocks containing directory entries and in * memory denode's will be in synch. */ int msdosfs_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct mbnambuf nb; struct vnode *vdp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; daddr_t bn; int error; int slotcount; int slotoffset = 0; int frcn; u_long cluster; int blkoff; int diroff; int blsize; int isadir; /* ~0 if found direntry is a directory */ u_long scn; /* starting cluster number */ struct vnode *pdp; struct denode *dp; struct denode *tdp; struct msdosfsmount *pmp; struct buf *bp = 0; struct direntry *dep = NULL; u_char dosfilename[12]; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; struct thread *td = cnp->cn_thread; int unlen; int wincnt = 1; int chksum = -1, chksum_ok; int olddos = 1; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): looking for %s\n", cnp->cn_nameptr); #endif dp = VTODE(vdp); pmp = dp->de_pmp; *vpp = NULL; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): vdp %p, dp %p, Attr %02x\n", vdp, dp, dp->de_Attributes); #endif /* * If they are going after the . or .. entry in the root directory, * they won't find it. DOS filesystems don't have them in the root * directory. So, we fake it. deget() is in on this scam too. */ if ((vdp->v_vflag & VV_ROOT) && cnp->cn_nameptr[0] == '.' && (cnp->cn_namelen == 1 || (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.'))) { isadir = ATTR_DIRECTORY; scn = MSDOSFSROOT; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): looking for . or .. in root directory\n"); #endif cluster = MSDOSFSROOT; blkoff = MSDOSFSROOT_OFS; goto foundroot; } switch (unix2dosfn((const u_char *)cnp->cn_nameptr, dosfilename, cnp->cn_namelen, 0, pmp)) { case 0: return (EINVAL); case 1: break; case 2: wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr, cnp->cn_namelen, pmp) + 1; break; case 3: olddos = 0; wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr, cnp->cn_namelen, pmp) + 1; break; } if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) { wincnt = 1; olddos = 1; } unlen = winLenFixup(cnp->cn_nameptr, cnp->cn_namelen); /* * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ slotcount = wincnt; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) slotcount = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): dos version of filename %s, length %ld\n", dosfilename, cnp->cn_namelen); #endif /* * Search the directory pointed at by vdp for the name pointed at * by cnp->cn_nameptr. */ tdp = NULL; mbnambuf_init(&nb); /* * The outer loop ranges over the clusters that make up the * directory. Note that the root directory is different from all * other directories. It has a fixed number of blocks that are not * part of the pool of allocatable clusters. So, we treat it a * little differently. The root directory starts at "cluster" 0. */ diroff = 0; for (frcn = 0;; frcn++) { error = pcbmap(dp, frcn, &bn, &cluster, &blsize); if (error) { if (error == E2BIG) break; return (error); } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } for (blkoff = 0; blkoff < blsize; blkoff += sizeof(struct direntry), diroff += sizeof(struct direntry)) { dep = (struct direntry *)(bp->b_data + blkoff); /* * If the slot is empty and we are still looking * for an empty then remember this one. If the * slot is not empty then check to see if it * matches what we are looking for. If the slot * has never been filled with anything, then the * remainder of the directory has never been used, * so there is no point in searching it. */ if (dep->deName[0] == SLOT_EMPTY || dep->deName[0] == SLOT_DELETED) { /* * Drop memory of previous long matches */ chksum = -1; mbnambuf_init(&nb); if (slotcount < wincnt) { slotcount++; slotoffset = diroff; } if (dep->deName[0] == SLOT_EMPTY) { brelse(bp); goto notfound; } } else { /* * If there wasn't enough space for our winentries, * forget about the empty space */ if (slotcount < wincnt) slotcount = 0; /* * Check for Win95 long filename entry */ if (dep->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn(&nb, (struct winentry *)dep, chksum, pmp); continue; } chksum = winChkName(&nb, (const u_char *)cnp->cn_nameptr, unlen, chksum, pmp); if (chksum == -2) { chksum = -1; continue; } /* * Ignore volume labels (anywhere, not just * the root directory). */ if (dep->deAttributes & ATTR_VOLUME) { chksum = -1; continue; } /* * Check for a checksum or name match */ chksum_ok = (chksum == winChksum(dep)); if (!chksum_ok && (!olddos || bcmp(dosfilename, dep->deName, 11))) { chksum = -1; continue; } #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): match blkoff %d, diroff %d\n", blkoff, diroff); #endif /* * Remember where this directory * entry came from for whoever did * this lookup. */ dp->de_fndoffset = diroff; if (chksum_ok && nameiop == RENAME) { /* * Target had correct long name * directory entries, reuse them * as needed. */ dp->de_fndcnt = wincnt - 1; } else { /* * Long name directory entries * not present or corrupt, can only * reuse dos directory entry. */ dp->de_fndcnt = 0; } goto found; } } /* for (blkoff = 0; .... */ /* * Release the buffer holding the directory cluster just * searched. */ brelse(bp); } /* for (frcn = 0; ; frcn++) */ notfound: /* * We hold no disk buffers at this point. */ /* * Fixup the slot description to point to the place where * we might put the new DOS direntry (putting the Win95 * long name entries before that) */ if (!slotcount) { slotcount = 1; slotoffset = diroff; } if (wincnt > slotcount) slotoffset += sizeof(struct direntry) * (wincnt - slotcount); /* * If we get here we didn't find the entry we were looking for. But * that's ok if we are creating or renaming and are at the end of * the pathname and the directory hasn't been removed. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): op %d, refcnt %ld\n", nameiop, dp->de_refcnt); printf(" slotcount %d, slotoffset %d\n", slotcount, slotoffset); #endif if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN) && dp->de_refcnt != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. */ error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* * Return an indication of where the new directory * entry should be put. */ dp->de_fndoffset = slotoffset; dp->de_fndcnt = wincnt - 1; /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * The pathname buffer is saved so that the name * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } #if 0 /* * Insert name into cache (as non-existent) if appropriate. * * XXX Negative caching is broken for msdosfs because the name * cache doesn't understand peculiarities such as case insensitivity * and 8.3 filenames. Hence, it may not invalidate all negative * entries if a file with this name is later created. */ if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(vdp, *vpp, cnp); #endif return (ENOENT); found: /* * NOTE: We still have the buffer with matched directory entry at * this point. */ isadir = dep->deAttributes & ATTR_DIRECTORY; scn = getushort(dep->deStartCluster); if (FAT32(pmp)) { scn |= getushort(dep->deHighClust) << 16; if (scn == pmp->pm_rootdirblk) { /* * There should actually be 0 here. * Just ignore the error. */ scn = MSDOSFSROOT; } } if (isadir) { cluster = scn; if (cluster == MSDOSFSROOT) blkoff = MSDOSFSROOT_OFS; else blkoff = 0; } else if (cluster == MSDOSFSROOT) blkoff = diroff; /* * Now release buf to allow deget to read the entry again. * Reserving it here and giving it to deget could result * in a deadlock. */ brelse(bp); bp = 0; foundroot: /* * If we entered at foundroot, then we are looking for the . or .. * entry of the filesystems root directory. isadir and scn were * setup before jumping here. And, bp is already null. */ if (FAT32(pmp) && scn == MSDOSFSROOT) scn = pmp->pm_rootdirblk; /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { /* * Don't allow deleting the root. */ if (blkoff == MSDOSFSROOT_OFS) return EROFS; /* really? XXX */ /* * Write access to directory required to delete files. */ error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* * Return pointer to current entry in dp->i_offset. * Save directory inode pointer in ndp->ni_dvp for dirremove(). */ if (dp->de_StartCluster == scn && isadir) { /* "." */ VREF(vdp); *vpp = vdp; return (0); } error = deget(pmp, cluster, blkoff, &tdp); if (error) return (error); *vpp = DETOV(tdp); return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if (blkoff == MSDOSFSROOT_OFS) return EROFS; /* really? XXX */ error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ if (dp->de_StartCluster == scn && isadir) return (EISDIR); if ((error = deget(pmp, cluster, blkoff, &tdp)) != 0) return (error); *vpp = DETOV(tdp); cnp->cn_flags |= SAVENAME; return (0); } /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { VOP_UNLOCK(pdp, 0, td); error = deget(pmp, cluster, blkoff, &tdp); - vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); *vpp = DETOV(tdp); } else if (dp->de_StartCluster == scn && isadir) { VREF(vdp); /* we want ourself, ie "." */ *vpp = vdp; } else { if ((error = deget(pmp, cluster, blkoff, &tdp)) != 0) return (error); *vpp = DETOV(tdp); } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } /* * dep - directory entry to copy into the directory * ddep - directory to add to * depp - return the address of the denode for the created directory entry * if depp != 0 * cnp - componentname needed for Win95 long filenames */ int createde(dep, ddep, depp, cnp) struct denode *dep; struct denode *ddep; struct denode **depp; struct componentname *cnp; { int error; u_long dirclust, diroffset; struct direntry *ndep; struct msdosfsmount *pmp = ddep->de_pmp; struct buf *bp; daddr_t bn; int blsize; #ifdef MSDOSFS_DEBUG printf("createde(dep %p, ddep %p, depp %p, cnp %p)\n", dep, ddep, depp, cnp); #endif /* * If no space left in the directory then allocate another cluster * and chain it onto the end of the file. There is one exception * to this. That is, if the root directory has no more space it * can NOT be expanded. extendfile() checks for and fails attempts * to extend the root directory. We just return an error in that * case. */ if (ddep->de_fndoffset >= ddep->de_FileSize) { diroffset = ddep->de_fndoffset + sizeof(struct direntry) - ddep->de_FileSize; dirclust = de_clcount(pmp, diroffset); error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR); if (error) { (void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED, NULL); return error; } /* * Update the size of the directory */ ddep->de_FileSize += de_cn2off(pmp, dirclust); } /* * We just read in the cluster with space. Copy the new directory * entry in. Then write it to disk. NOTE: DOS directories * do not get smaller as clusters are emptied. */ error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset), &bn, &dirclust, &blsize); if (error) return error; diroffset = ddep->de_fndoffset; if (dirclust != MSDOSFSROOT) diroffset &= pmp->pm_crbomask; if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) != 0) { brelse(bp); return error; } ndep = bptoep(pmp, bp, ddep->de_fndoffset); DE_EXTERNALIZE(ndep, dep); /* * Now write the Win95 long name */ if (ddep->de_fndcnt > 0) { u_int8_t chksum = winChksum(ndep); const u_char *un = (const u_char *)cnp->cn_nameptr; int unlen = cnp->cn_namelen; int cnt = 1; while (--ddep->de_fndcnt >= 0) { if (!(ddep->de_fndoffset & pmp->pm_crbomask)) { if (DETOV(ddep)->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else if ((error = bwrite(bp)) != 0) return error; ddep->de_fndoffset -= sizeof(struct direntry); error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset), &bn, 0, &blsize); if (error) return error; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return error; } ndep = bptoep(pmp, bp, ddep->de_fndoffset); } else { ndep--; ddep->de_fndoffset -= sizeof(struct direntry); } if (!unix2winfn(un, unlen, (struct winentry *)ndep, cnt++, chksum, pmp)) break; } } if (DETOV(ddep)->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else if ((error = bwrite(bp)) != 0) return error; /* * If they want us to return with the denode gotten. */ if (depp) { if (dep->de_Attributes & ATTR_DIRECTORY) { dirclust = dep->de_StartCluster; if (FAT32(pmp) && dirclust == pmp->pm_rootdirblk) dirclust = MSDOSFSROOT; if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } return deget(pmp, dirclust, diroffset, depp); } return 0; } /* * Be sure a directory is empty except for "." and "..". Return 1 if empty, * return 0 if not empty or error. */ int dosdirempty(dep) struct denode *dep; { int blsize; int error; u_long cn; daddr_t bn; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; /* * Since the filesize field in directory entries for a directory is * zero, we just have to feel our way through the directory until * we hit end of file. */ for (cn = 0;; cn++) { if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) { if (error == E2BIG) return (1); /* it's empty */ return (0); } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } for (dentp = (struct direntry *)bp->b_data; (char *)dentp < bp->b_data + blsize; dentp++) { if (dentp->deName[0] != SLOT_DELETED && (dentp->deAttributes & ATTR_VOLUME) == 0) { /* * In dos directories an entry whose name * starts with SLOT_EMPTY (0) starts the * beginning of the unused part of the * directory, so we can just return that it * is empty. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); return (1); } /* * Any names other than "." and ".." in a * directory mean it is not empty. */ if (bcmp(dentp->deName, ". ", 11) && bcmp(dentp->deName, ".. ", 11)) { brelse(bp); #ifdef MSDOSFS_DEBUG printf("dosdirempty(): entry found %02x, %02x\n", dentp->deName[0], dentp->deName[1]); #endif return (0); /* not empty */ } } } brelse(bp); } /* NOTREACHED */ } /* * Check to see if the directory described by target is in some * subdirectory of source. This prevents something like the following from * succeeding and leaving a bunch or files and directories orphaned. mv * /a/b/c /a/b/c/d/e/f Where c and f are directories. * * source - the inode for /a/b/c * target - the inode for /a/b/c/d/e/f * * Returns 0 if target is NOT a subdirectory of source. * Otherwise returns a non-zero error number. * The target inode is always unlocked on return. */ int doscheckpath(source, target) struct denode *source; struct denode *target; { daddr_t scn; struct msdosfsmount *pmp; struct direntry *ep; struct denode *dep; struct buf *bp = NULL; int error = 0; dep = target; if ((target->de_Attributes & ATTR_DIRECTORY) == 0 || (source->de_Attributes & ATTR_DIRECTORY) == 0) { error = ENOTDIR; goto out; } if (dep->de_StartCluster == source->de_StartCluster) { error = EEXIST; goto out; } if (dep->de_StartCluster == MSDOSFSROOT) goto out; pmp = dep->de_pmp; #ifdef DIAGNOSTIC if (pmp != source->de_pmp) panic("doscheckpath: source and target on different filesystems"); #endif if (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk) goto out; for (;;) { if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) { error = ENOTDIR; break; } scn = dep->de_StartCluster; error = bread(pmp->pm_devvp, cntobn(pmp, scn), pmp->pm_bpcluster, NOCRED, &bp); if (error) break; ep = (struct direntry *) bp->b_data + 1; if ((ep->deAttributes & ATTR_DIRECTORY) == 0 || bcmp(ep->deName, ".. ", 11) != 0) { error = ENOTDIR; break; } scn = getushort(ep->deStartCluster); if (FAT32(pmp)) scn |= getushort(ep->deHighClust) << 16; if (scn == source->de_StartCluster) { error = EINVAL; break; } if (scn == MSDOSFSROOT) break; if (FAT32(pmp) && scn == pmp->pm_rootdirblk) { /* * scn should be 0 in this case, * but we silently ignore the error. */ break; } vput(DETOV(dep)); brelse(bp); bp = NULL; /* NOTE: deget() clears dep on error */ if ((error = deget(pmp, scn, 0, &dep)) != 0) break; } out:; if (bp) brelse(bp); if (error == ENOTDIR) printf("doscheckpath(): .. not a directory?\n"); if (dep != NULL) vput(DETOV(dep)); return (error); } /* * Read in the disk block containing the directory entry (dirclu, dirofs) * and return the address of the buf header, and the address of the * directory entry within the block. */ int readep(pmp, dirclust, diroffset, bpp, epp) struct msdosfsmount *pmp; u_long dirclust, diroffset; struct buf **bpp; struct direntry **epp; { int error; daddr_t bn; int blsize; blsize = pmp->pm_bpcluster; if (dirclust == MSDOSFSROOT && de_blk(pmp, diroffset + blsize) > pmp->pm_rootdirsize) blsize = de_bn2off(pmp, pmp->pm_rootdirsize) & pmp->pm_crbomask; bn = detobn(pmp, dirclust, diroffset); if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, bpp)) != 0) { brelse(*bpp); *bpp = NULL; return (error); } if (epp) *epp = bptoep(pmp, *bpp, diroffset); return (0); } /* * Read in the disk block containing the directory entry dep came from and * return the address of the buf header, and the address of the directory * entry within the block. */ int readde(dep, bpp, epp) struct denode *dep; struct buf **bpp; struct direntry **epp; { return (readep(dep->de_pmp, dep->de_dirclust, dep->de_diroffset, bpp, epp)); } /* * Remove a directory entry. At this point the file represented by the * directory entry to be removed is still full length until noone has it * open. When the file no longer being used msdosfs_inactive() is called * and will truncate the file to 0 length. When the vnode containing the * denode is needed for some other purpose by VFS it will call * msdosfs_reclaim() which will remove the denode from the denode cache. */ int removede(pdep, dep) struct denode *pdep; /* directory where the entry is removed */ struct denode *dep; /* file to be removed */ { int error; struct direntry *ep; struct buf *bp; daddr_t bn; int blsize; struct msdosfsmount *pmp = pdep->de_pmp; u_long offset = pdep->de_fndoffset; #ifdef MSDOSFS_DEBUG printf("removede(): filename %s, dep %p, offset %08lx\n", dep->de_Name, dep, offset); #endif dep->de_refcnt--; offset += sizeof(struct direntry); do { offset -= sizeof(struct direntry); error = pcbmap(pdep, de_cluster(pmp, offset), &bn, 0, &blsize); if (error) return error; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return error; } ep = bptoep(pmp, bp, offset); /* * Check whether, if we came here the second time, i.e. * when underflowing into the previous block, the last * entry in this block is a longfilename entry, too. */ if (ep->deAttributes != ATTR_WIN95 && offset != pdep->de_fndoffset) { brelse(bp); break; } offset += sizeof(struct direntry); while (1) { /* * We are a bit agressive here in that we delete any Win95 * entries preceding this entry, not just the ones we "own". * Since these presumably aren't valid anyway, * there should be no harm. */ offset -= sizeof(struct direntry); ep--->deName[0] = SLOT_DELETED; if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) || !(offset & pmp->pm_crbomask) || ep->deAttributes != ATTR_WIN95) break; } if (DETOV(pdep)->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else if ((error = bwrite(bp)) != 0) return error; } while (!(pmp->pm_flags & MSDOSFSMNT_NOWIN95) && !(offset & pmp->pm_crbomask) && offset); return 0; } /* * Create a unique DOS name in dvp */ int uniqdosname(dep, cnp, cp) struct denode *dep; struct componentname *cnp; u_char *cp; { struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; int gen; int blsize; u_long cn; daddr_t bn; struct buf *bp; int error; if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) return (unix2dosfn((const u_char *)cnp->cn_nameptr, cp, cnp->cn_namelen, 0, pmp) ? 0 : EINVAL); for (gen = 1;; gen++) { /* * Generate DOS name with generation number */ if (!unix2dosfn((const u_char *)cnp->cn_nameptr, cp, cnp->cn_namelen, gen, pmp)) return gen == 1 ? EINVAL : EEXIST; /* * Now look for a dir entry with this exact name */ for (cn = error = 0; !error; cn++) { if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) { if (error == E2BIG) /* EOF reached and not found */ return 0; return error; } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return error; } for (dentp = (struct direntry *)bp->b_data; (char *)dentp < bp->b_data + blsize; dentp++) { if (dentp->deName[0] == SLOT_EMPTY) { /* * Last used entry and not found */ brelse(bp); return 0; } /* * Ignore volume labels and Win95 entries */ if (dentp->deAttributes & ATTR_VOLUME) continue; if (!bcmp(dentp->deName, cp, 11)) { error = EEXIST; break; } } brelse(bp); } } } /* * Find any Win'95 long filename entry in directory dep */ int findwin95(dep) struct denode *dep; { struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; int blsize, win95; u_long cn; daddr_t bn; struct buf *bp; win95 = 1; /* * Read through the directory looking for Win'95 entries * Note: Error currently handled just as EOF XXX */ for (cn = 0;; cn++) { if (pcbmap(dep, cn, &bn, 0, &blsize)) return (win95); if (bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) { brelse(bp); return (win95); } for (dentp = (struct direntry *)bp->b_data; (char *)dentp < bp->b_data + blsize; dentp++) { if (dentp->deName[0] == SLOT_EMPTY) { /* * Last used entry and not found */ brelse(bp); return (win95); } if (dentp->deName[0] == SLOT_DELETED) { /* * Ignore deleted files * Note: might be an indication of Win'95 anyway XXX */ continue; } if (dentp->deAttributes == ATTR_WIN95) { brelse(bp); return 1; } win95 = 0; } brelse(bp); } } Index: head/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 175201) +++ head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 175202) @@ -1,969 +1,969 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Mount options that we support. */ static const char *msdosfs_opts[] = { "async", "noatime", "noclusterr", "noclusterw", "export", "force", "from", "sync", "cs_dos", "cs_local", "cs_win", "dirmask", "gid", "kiconv", "large", "longname", "longnames", "mask", "shortname", "shortnames", "uid", "win95", "nowin95", NULL }; #if 1 /*def PC98*/ /* * XXX - The boot signature formatted by NEC PC-98 DOS looks like a * garbage or a random value :-{ * If you want to use that broken-signatured media, define the * following symbol even though PC/AT. * (ex. mount PC-98 DOS formatted FD on PC/AT) */ #define MSDOSFS_NOCHECKSIG #endif MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table"); struct iconv_functions *msdosfs_iconv; static int update_mp(struct mount *mp, struct thread *td); static int mountmsdosfs(struct vnode *devvp, struct mount *mp, struct thread *td); static vfs_fhtovp_t msdosfs_fhtovp; static vfs_mount_t msdosfs_mount; static vfs_root_t msdosfs_root; static vfs_statfs_t msdosfs_statfs; static vfs_sync_t msdosfs_sync; static vfs_unmount_t msdosfs_unmount; /* Maximum length of a character set name (arbitrary). */ #define MAXCSLEN 64 static int update_mp(struct mount *mp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); void *dos, *win, *local; int error, v; if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) { if (msdosfs_iconv != NULL) { error = vfs_getopt(mp->mnt_optnew, "cs_win", &win, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_local", &local, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_dos", &dos, NULL); if (!error) { msdosfs_iconv->open(win, local, &pmp->pm_u2w); msdosfs_iconv->open(local, win, &pmp->pm_w2u); msdosfs_iconv->open(dos, local, &pmp->pm_u2d); msdosfs_iconv->open(local, dos, &pmp->pm_d2u); } if (error != 0) return (error); } else { pmp->pm_w2u = NULL; pmp->pm_u2w = NULL; pmp->pm_d2u = NULL; pmp->pm_u2d = NULL; } } if (1 == vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v)) pmp->pm_gid = v; if (1 == vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v)) pmp->pm_uid = v; if (1 == vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v)) pmp->pm_mask = v & ALLPERMS; if (1 == vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v)) pmp->pm_dirmask = v & ALLPERMS; vfs_flagopt(mp->mnt_optnew, "shortname", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "shortnames", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "longname", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "longnames", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "kiconv", &pmp->pm_flags, MSDOSFSMNT_KICONV); if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; else pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else if (!(pmp->pm_flags & (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) { struct vnode *rootvp; /* * Try to divine whether to support Win'95 long filenames */ if (FAT32(pmp)) pmp->pm_flags |= MSDOSFSMNT_LONGNAME; else { if ((error = msdosfs_root(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return error; pmp->pm_flags |= findwin95(VTODE(rootvp)) ? MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME; vput(rootvp); } } return 0; } static int msdosfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct msdosfs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mask", "%d", args.mask); ma = mount_argf(ma, "dirmask", "%d", args.dirmask); ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname"); ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv"); ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN); ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN); ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN); error = kernel_mount(ma, flags); return (error); } /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; /* vnode for blk device to mount */ /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; struct nameidata ndp; int error, flags; mode_t accessmode; char *from; if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts)) return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) { /* * Forbid export requests if filesystem has * MSDOSFS_LARGEFS flag set. */ if ((pmp->pm_flags & MSDOSFS_LARGEFS) != 0) { vfs_mount_error(mp, "MSDOSFS_LARGEFS flag set, cannot export"); return (EOPNOTSUPP); } } if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT, td); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return (error); /* * Now the volume is clean. Mark it so while the * device is still rw. */ error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* Downgrade the device from rw to ro. */ DROP_GIANT(); g_topology_lock(); error = g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* * Backing out after an error was painful in the * above. Now we are committed to succeeding. */ pmp->pm_fmod = 0; pmp->pm_flags |= MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); DROP_GIANT(); g_topology_lock(); error = g_access(pmp->pm_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); pmp->pm_fmod = 1; pmp->pm_flags &= ~MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); /* Now that the volume is modifiable, mark it dirty. */ error = markvoldirty(pmp, 1); if (error) return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); error = namei(&ndp); if (error) return (error); devvp = ndp.ni_vp; NDFREE(&ndp, NDF_ONLY_PNBUF); if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp, td); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { if (devvp != pmp->pm_devvp) error = EINVAL; /* XXX needs translation */ else vput(devvp); } if (error) { vrele(devvp); return (error); } error = update_mp(mp, td); if (error) { if ((mp->mnt_flag & MNT_UPDATE) == 0) msdosfs_unmount(mp, MNT_FORCE, td); return error; } vfs_mountedfrom(mp, from); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(struct vnode *devvp, struct mount *mp, struct thread *td) { struct msdosfsmount *pmp; struct buf *bp; struct cdev *dev = devvp->v_rdev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; u_int8_t SecPerClust; u_long clusters; int ronly, error; struct g_consumer *cp; struct bufobj *bo; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; /* XXX: use VOP_ACCESS to check FS perms */ DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); bo = &devvp->v_bufobj; bp = NULL; /* This and pmp both used in error_exit. */ pmp = NULL; /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. * * NOTE: 8192 is a magic size that works for ffs. */ error = bread(devvp, 0, 8192, NOCRED, &bp); if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #ifndef MSDOSFS_NOCHECKSIG if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { error = EINVAL; goto error_exit; } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO); pmp->pm_mountp = mp; pmp->pm_cp = cp; pmp->pm_bo = bo; /* * Initialize ownerships and permissions, since nothing else will * initialize them iff we are mounting root. */ pmp->pm_uid = UID_ROOT; pmp->pm_gid = GID_WHEEL; pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR; /* * Experimental support for large MS-DOS filesystems. * WARNING: This uses at least 32 bytes of kernel memory (which is not * reclaimed until the FS is unmounted) for each file on disk to map * between the 32-bit inode numbers used by VFS and the 64-bit * pseudo-inode numbers used internally by msdosfs. This is only * safe to use in certain controlled situations (e.g. read-only FS * with less than 1 million files). * Since the mappings do not persist across unmounts (or reboots), these * filesystems are not suitable for exporting through NFS, or any other * application that requires fixed inode numbers. */ vfs_flagopt(mp->mnt_optnew, "large", &pmp->pm_flags, MSDOSFS_LARGEFS); /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); if (pmp->pm_BytesPerSec < DEV_BSIZE) { error = EINVAL; goto error_exit; } pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; /* calculate the ratio of sector size to DEV_BSIZE */ pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE; /* XXX - We should probably check more values here */ if (!pmp->pm_BytesPerSec || !SecPerClust || !pmp->pm_Heads #ifdef PC98 || !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 255) { #else || !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 63) { #endif error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (!(pmp->pm_flags & MSDOSFS_LARGEFS)) { if (pmp->pm_HugeSectors > 0xffffffff / (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) { /* * We cannot deal currently with this size of disk * due to fileid limitations (see msdosfs_getattr and * msdosfs_readdir) */ error = EINVAL; vfs_mount_error(mp, "Disk too big, try '-o large' mount option"); goto error_exit; } } if (pmp->pm_RootDirEnts == 0) { if (pmp->pm_Sectors || pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; printf("mountmsdosfs(): bad FAT32 filesystem\n"); goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition * - number of FAT sectors: >= 1 */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < DEV_BSIZE) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_FATsecs == 0) ) { error = EINVAL; goto error_exit; } pmp->pm_HugeSectors *= pmp->pm_BlkPerSec; pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */ pmp->pm_FATsecs *= pmp->pm_BlkPerSec; SecPerClust *= pmp->pm_BlkPerSec; pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec; } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry) + DEV_BSIZE - 1) / DEV_BSIZE; /* in blocks */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust + 1; pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? */ if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one fat entry will not be split across * multiple blocks. */ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; if (pmp->pm_maxcluster >= clusters) { printf("Warning: number of clusters (%ld) exceeds FAT " "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters); pmp->pm_maxcluster = clusters - 1; } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * 512; else pmp->pm_fatblocksize = PAGE_SIZE; pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize, pmp->pm_BytesPerSec); pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE; pmp->pm_bnshift = ffs(DEV_BSIZE) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * DEV_BSIZE; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check the fsinfo sector if we have one. Silently fix up our * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff) * or too large. Ignore fp->fsinfree for now, since we need to * read the entire FAT anyway to fill the inuse map. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) { pmp->pm_nxtfree = getulong(fp->fsinxtfree); if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; } else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Finish initializing pmp->pm_nxtfree (just in case the first few * sectors aren't properly reserved in the FAT). This completes * the fixup for fp->fsinxtfree, and fixes up the zero-initialized * value if there is no fsinfo. We will use pmp->pm_nxtfree * internally even if there is no fsinfo. */ if (pmp->pm_nxtfree < CLUST_FIRST) pmp->pm_nxtfree = CLUST_FIRST; /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_devvp = devvp; /* * Have the inuse map filled in. */ if ((error = fillinusemap(pmp)) != 0) goto error_exit; /* * If they want fat updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the fat being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else { if ((error = markvoldirty(pmp, 1)) != 0) { (void)markvoldirty(pmp, 0); goto error_exit; } pmp->pm_fmod = 1; } mp->mnt_data = pmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if (pmp->pm_flags & MSDOSFS_LARGEFS) msdosfs_fileno_init(mp); return 0; error_exit: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (pmp) { if (pmp->pm_inusemap) free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } return (error); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(struct mount *mp, int mntflags, struct thread *td) { struct msdosfsmount *pmp; int error, flags; flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return error; pmp = VFSTOMSDOSFS(mp); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) { error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } } if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { if (pmp->pm_w2u) msdosfs_iconv->close(pmp->pm_w2u); if (pmp->pm_u2w) msdosfs_iconv->close(pmp->pm_u2w); if (pmp->pm_d2u) msdosfs_iconv->close(pmp->pm_d2u); if (pmp->pm_u2d) msdosfs_iconv->close(pmp->pm_u2d); } #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; VI_LOCK(vp); vn_printf(vp, "msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("freef %p, freeb %p, mount %p\n", TAILQ_NEXT(vp, v_freelist), vp->v_freelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd), TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd), vp->v_bufobj.bo_numoutput, vp->v_type); VI_UNLOCK(vp); } #endif DROP_GIANT(); g_topology_lock(); g_vfs_close(pmp->pm_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(pmp->pm_devvp); free(pmp->pm_inusemap, M_MSDOSFSFAT); if (pmp->pm_flags & MSDOSFS_LARGEFS) msdosfs_fileno_free(mp); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (0); } static int msdosfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_maxcluster + 1; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ return (0); } static int msdosfs_sync(struct mount *mp, int waitfor, struct thread *td) { struct vnode *vp, *nvp; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; /* * If we ever switch to not updating all of the fats all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update fats here */ } } /* * Write back each (modified) denode. */ MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, nvp) { VI_LOCK(vp); if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); dep = VTODE(vp); if ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { MNT_ILOCK(mp); if (error == ENOENT) goto loop; continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Flush filesystem control info. */ if (waitfor != MNT_LAZY) { - vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(pmp->pm_devvp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp, 0, td); } return (allerror); } static int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; int error; error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); vnode_create_vobject(*vpp, dep->de_FileSize, curthread); return (0); } static struct vfsops msdosfs_vfsops = { .vfs_fhtovp = msdosfs_fhtovp, .vfs_mount = msdosfs_mount, .vfs_cmount = msdosfs_cmount, .vfs_root = msdosfs_root, .vfs_statfs = msdosfs_statfs, .vfs_sync = msdosfs_sync, .vfs_unmount = msdosfs_unmount, }; VFS_SET(msdosfs_vfsops, msdosfs, 0); MODULE_VERSION(msdosfs, 1); Index: head/sys/fs/msdosfs/msdosfs_vnops.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_vnops.c (revision 175201) +++ head/sys/fs/msdosfs/msdosfs_vnops.c (revision 175202) @@ -1,2016 +1,2016 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DOS_FILESIZE_MAX 0xffffffff /* * Prototypes for MSDOSFS vnode operations */ static vop_advlock_t msdosfs_advlock; static vop_create_t msdosfs_create; static vop_mknod_t msdosfs_mknod; static vop_open_t msdosfs_open; static vop_close_t msdosfs_close; static vop_access_t msdosfs_access; static vop_getattr_t msdosfs_getattr; static vop_setattr_t msdosfs_setattr; static vop_read_t msdosfs_read; static vop_write_t msdosfs_write; static vop_fsync_t msdosfs_fsync; static vop_remove_t msdosfs_remove; static vop_link_t msdosfs_link; static vop_rename_t msdosfs_rename; static vop_mkdir_t msdosfs_mkdir; static vop_rmdir_t msdosfs_rmdir; static vop_symlink_t msdosfs_symlink; static vop_readdir_t msdosfs_readdir; static vop_bmap_t msdosfs_bmap; static vop_strategy_t msdosfs_strategy; static vop_print_t msdosfs_print; static vop_pathconf_t msdosfs_pathconf; static vop_vptofh_t msdosfs_vptofh; /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. This is to insure we * retreive the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int msdosfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct denode ndirent; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct timespec ts; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap); #endif /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_create: no name"); #endif bzero(&ndirent, sizeof(ndirent)); error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = (ap->a_vap->va_mode & VWRITE) ? ATTR_ARCHIVE : ATTR_ARCHIVE | ATTR_READONLY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_dev = pdep->de_dev; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); return (0); bad: return (error); } static int msdosfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { return (EINVAL); } static int msdosfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); vnode_create_vobject(ap->a_vp, dep->de_FileSize, ap->a_td); return 0; } static int msdosfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct timespec ts; VI_LOCK(vp); if (vp->v_usecount > 1) { getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); } VI_UNLOCK(vp); return 0; } static int msdosfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; mode_t file_mode, mode = ap->a_mode; file_mode = (S_IXUSR|S_IXGRP|S_IXOTH) | (S_IRUSR|S_IRGRP|S_IROTH) | ((dep->de_Attributes & ATTR_READONLY) ? 0 : (S_IWUSR|S_IWGRP|S_IWOTH)); file_mode &= (vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); /* * Disallow writing to directories and regular files if the * filesystem is read-only. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid, ap->a_mode, ap->a_cred, NULL)); } static int msdosfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; mode_t mode; struct timespec ts; u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); uint64_t fileid; getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); vap->va_fsid = dev2udev(dep->de_dev); /* * The following computation of the fileid must be the same as that * used in msdosfs_readdir() to compute d_fileno. If not, pwd * doesn't work. */ if (dep->de_Attributes & ATTR_DIRECTORY) { fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) * dirsperblk; if (dep->de_StartCluster == MSDOSFSROOT) fileid = 1; } else { fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) * dirsperblk; if (dep->de_dirclust == MSDOSFSROOT) fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk; fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry); } if (pmp->pm_flags & MSDOSFS_LARGEFS) vap->va_fileid = msdosfs_fileno_map(pmp->pm_mountp, fileid); else vap->va_fileid = (long)fileid; if ((dep->de_Attributes & ATTR_READONLY) == 0) mode = S_IRWXU|S_IRWXG|S_IRWXO; else mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_mode = mode & (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); vap->va_uid = pmp->pm_uid; vap->va_gid = pmp->pm_gid; vap->va_nlink = 1; vap->va_rdev = 0; vap->va_size = dep->de_FileSize; fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime); vap->va_ctime = vap->va_mtime; if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) { fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime); fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun, 0, &vap->va_birthtime); } else { vap->va_atime = vap->va_mtime; vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; } vap->va_flags = 0; if ((dep->de_Attributes & ATTR_ARCHIVE) == 0) vap->va_flags |= SF_ARCHIVED; vap->va_gen = 0; vap->va_blocksize = pmp->pm_bpcluster; vap->va_bytes = (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask; vap->va_type = ap->a_vp->v_type; vap->va_filerev = dep->de_modrev; return (0); } static int msdosfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): vp %p, vap %p, cred %p, p %p\n", ap->a_vp, vap, cred, ap->a_td); #endif /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): returning EINVAL\n"); printf(" va_type %d, va_nlink %x, va_fsid %lx, va_fileid %lx\n", vap->va_type, vap->va_nlink, vap->va_fsid, vap->va_fileid); printf(" va_blocksize %lx, va_rdev %x, va_bytes %qx, va_gen %lx\n", vap->va_blocksize, vap->va_rdev, vap->va_bytes, vap->va_gen); printf(" va_uid %x, va_gid %x\n", vap->va_uid, vap->va_gid); #endif return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } /* * We are very inconsistent about handling unsupported * attributes. We ignored the access time and the * read and execute bits. We were strict for the other * attributes. * * Here we are strict, stricter than ufs in not allowing * users to attempt to set SF_SETTABLE bits or anyone to * set unsupported bits. However, we ignore attempts to * set ATTR_ARCHIVE for directories `cp -pr' from a more * sensible filesystem attempts it a lot. */ if (vap->va_flags & SF_SETTABLE) { error = priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0); if (error) return (error); } if (vap->va_flags & ~SF_ARCHIVED) return EOPNOTSUPP; if (vap->va_flags & SF_ARCHIVED) dep->de_Attributes &= ~ATTR_ARCHIVE; else if (!(dep->de_Attributes & ATTR_DIRECTORY)) dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { uid_t uid; gid_t gid; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); uid = vap->va_uid; if (uid == (uid_t)VNOVAL) uid = pmp->pm_uid; gid = vap->va_gid; if (gid == (gid_t)VNOVAL) gid = pmp->pm_gid; if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid || (gid != pmp->pm_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0); if (error) return (error); } if (uid != pmp->pm_uid || gid != pmp->pm_gid) return EINVAL; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: /* * Truncation is only supported for regular files, * Disallow it if the filesystem is read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support any file types except regular * files and directories in this file system, so * this (default) case is unreachable and can do * anything. Keep falling through to detrunc() * for now. */ break; } error = detrunc(dep, vap->va_size, 0, cred, ap->a_td); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (vap->va_vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, cred, ap->a_td); if (error) error = VOP_ACCESS(vp, VWRITE, cred, ap->a_td); } else error = VOP_ACCESS(vp, VADMIN, cred, ap->a_td); if (vp->v_type != VDIR) { if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 && vap->va_atime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_ACCESS; timespec2fattime(&vap->va_atime, 0, &dep->de_ADate, NULL, NULL); } if (vap->va_mtime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_UPDATE; timespec2fattime(&vap->va_mtime, 0, &dep->de_MDate, &dep->de_MTime, NULL); } dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } /* * DOS files only have the ability to have their writability * attribute set, so we use the owner write bit to set the readonly * attribute. */ if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } if (vp->v_type != VDIR) { /* We ignore the read and execute bits. */ if (vap->va_mode & VWRITE) dep->de_Attributes &= ~ATTR_READONLY; else dep->de_Attributes |= ATTR_READONLY; dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } return (deupdat(dep, 0)); } static int msdosfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error = 0; int blsize; int isadir; int orig_resid; u_int n; u_long diff; u_long on; daddr_t lbn; daddr_t rablock; int rasize; int seqcount; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; /* * If they didn't ask for any data, then we are done. */ orig_resid = uio->uio_resid; if (orig_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. * We don't need to check for large offsets as in ffs because * dep->de_FileSize <= DOS_FILESIZE_MAX < OFF_MAX, so large * offsets cannot cause overflow even in theory. */ seqcount = ap->a_ioflag >> IO_SEQSHIFT; isadir = dep->de_Attributes & ATTR_DIRECTORY; do { if (uio->uio_offset >= dep->de_FileSize) break; lbn = de_cluster(pmp, uio->uio_offset); rablock = lbn + 1; blsize = pmp->pm_bpcluster; on = uio->uio_offset & pmp->pm_crbomask; /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ if (isadir) { /* convert cluster # to block # */ error = pcbmap(dep, lbn, &lbn, 0, &blsize); if (error == E2BIG) { error = EINVAL; break; } else if (error) break; error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); } else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) { error = bread(vp, lbn, blsize, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, dep->de_FileSize, lbn, blsize, NOCRED, on + uio->uio_resid, seqcount, &bp); } else if (seqcount > 1) { rasize = blsize; error = breadn(vp, lbn, blsize, &rablock, &rasize, 1, NOCRED, &bp); } else { error = bread(vp, lbn, blsize, NOCRED, &bp); } if (error) { brelse(bp); break; } diff = pmp->pm_bpcluster - on; n = diff > uio->uio_resid ? uio->uio_resid : diff; diff = dep->de_FileSize - uio->uio_offset; if (diff < n) n = diff; diff = blsize - bp->b_resid; if (diff < n) n = diff; error = uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) dep->de_flag |= DE_ACCESS; return (error); } /* * Write data to a file or directory. */ static int msdosfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int n; int croffset; int resid; u_long osize; int error = 0; u_long count; int seqcount; daddr_t bn, lastcn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; struct thread *td = uio->uio_td; struct vnode *vp = ap->a_vp; struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; #ifdef MSDOSFS_DEBUG printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n", vp, uio, ioflag, cred); printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n", dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; thisvp = vp; break; case VDIR: return EISDIR; default: panic("msdosfs_write(): bad file type"); } /* * This is needed (unlike in ffs_write()) because we extend the * file outside of the loop but we don't want to extend the file * for writes of 0 bytes. */ if (uio->uio_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. */ if ((uoff_t)uio->uio_offset + uio->uio_resid > DOS_FILESIZE_MAX) return (EFBIG); /* * If they've exceeded their filesize limit, tell them about it. */ if (td != NULL) { PROC_LOCK(td->td_proc); if ((uoff_t)uio->uio_offset + uio->uio_resid > lim_cur(td->td_proc, RLIMIT_FSIZE)) { psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); } /* * If the offset we are starting the write at is beyond the end of * the file, then they've done a seek. Unix filesystems allow * files with holes in them, DOS doesn't so we must fill the hole * with zeroed blocks. */ if (uio->uio_offset > dep->de_FileSize) { error = deextend(dep, uio->uio_offset, cred); if (error) return (error); } /* * Remember some values in case the write fails. */ resid = uio->uio_resid; osize = dep->de_FileSize; /* * If we write beyond the end of the file, extend it to its ultimate * size ahead of the time to hopefully get a contiguous area. */ if (uio->uio_offset + resid > osize) { count = de_clcount(pmp, uio->uio_offset + resid) - de_clcount(pmp, osize); error = extendfile(dep, count, NULL, NULL, 0); if (error && (error != ENOSPC || (ioflag & IO_UNIT))) goto errexit; lastcn = dep->de_fc[FC_LASTFC].fc_frcn; } else lastcn = de_clcount(pmp, osize) - 1; seqcount = ioflag >> IO_SEQSHIFT; do { if (de_cluster(pmp, uio->uio_offset) > lastcn) { error = ENOSPC; break; } croffset = uio->uio_offset & pmp->pm_crbomask; n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; /* The object size needs to be set before buffer is allocated */ vnode_pager_setsize(vp, dep->de_FileSize); } bn = de_cluster(pmp, uio->uio_offset); if ((uio->uio_offset & pmp->pm_crbomask) == 0 && (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > de_cluster(pmp, uio->uio_offset) || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. */ bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0); vfs_bio_clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the fat table. (see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0); if (error) bp->b_blkno = -1; else bp->b_blkno = bn; } if (bp->b_blkno == -1) { brelse(bp); if (!error) error = EIO; /* XXX */ break; } } else { /* * The block we need to write into exists, so read it in. */ error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp); if (error) { brelse(bp); break; } } /* * Should these vnode_pager_* functions be done on dir * files? */ /* * Copy the data from user space into the buf header. */ error = uiomove(bp->b_data + croffset, n, uio); if (error) { brelse(bp); break; } /* Prepare for clustered writes in some else clauses. */ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; /* * If IO_SYNC, then each buffer is written synchronously. * Otherwise, if we have a severe page deficiency then * write the buffer asynchronously. Otherwise, if on a * cluster boundary then write the buffer asynchronously, * combining it with contiguous clusters if permitted and * possible, since we don't expect more writes into this * buffer soon. Otherwise, do a delayed write because we * expect more writes into this buffer soon. */ if (ioflag & IO_SYNC) (void)bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else if (n + croffset == pmp->pm_bpcluster) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) cluster_write(vp, bp, dep->de_FileSize, seqcount); else bawrite(bp); } else bdwrite(bp); dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); /* * If the write failed and they want us to, truncate the file back * to the size it was before the write was attempted. */ errexit: if (error) { if (ioflag & IO_UNIT) { detrunc(dep, osize, ioflag & IO_SYNC, NOCRED, NULL); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL); if (uio->uio_resid != resid) error = 0; } } else if (ioflag & IO_SYNC) error = deupdat(dep, 1); return (error); } /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. Maybe we * could just do a sync if they try an fsync on a directory file. */ static int msdosfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (deupdat(VTODE(ap->a_vp), ap->a_waitfor == MNT_WAIT)); } static int msdosfs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); struct denode *ddep = VTODE(ap->a_dvp); int error; if (ap->a_vp->v_type == VDIR) error = EPERM; else error = removede(ddep, dep); #ifdef MSDOSFS_DEBUG printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount); #endif return (error); } /* * DOS filesystems don't know what links are. */ static int msdosfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { return (EOPNOTSUPP); } /* * Renames on files require moving the denode to a new hash queue since the * denode's location is used to compute which hash queue to put the file * in. Unless it is a rename in place. For example "mv a b". * * What follows is the basic algorithm: * * if (file move) { * if (dest file exists) { * remove dest file * } * if (dest and src in same directory) { * rewrite name in existing directory slot * } else { * write new entry in dest directory * update offset and dirclust in denode * move denode to new hash chain * clear old directory entry * } * } else { * directory move * if (dest directory exists) { * if (dest is not empty) { * return ENOTEMPTY * } * remove dest directory * } * if (dest and src in same directory) { * rewrite name in existing entry * } else { * be sure dest is not a child of src directory * write entry in dest directory * update "." and ".." in moved directory * clear old directory entry for moved directory * } * } * * On entry: * source's parent directory is unlocked * source file or directory is unlocked * destination's parent directory is locked * destination file or directory is locked if it exists * * On exit: * all denodes should be released */ static int msdosfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; struct denode *ip, *xp, *dp, *zp; u_char toname[11], oldname[11]; u_long from_diroffset, to_diroffset; u_char to_count; int doingdirectory = 0, newparent = 0; int error; u_long cn; daddr_t bn; struct denode *fddep; /* from file's parent directory */ struct msdosfsmount *pmp; struct direntry *dotdotp; struct buf *bp; fddep = VTODE(ap->a_fdvp); pmp = fddep->de_pmp; pmp = VFSTOMSDOSFS(fdvp->v_mount); #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("msdosfs_rename: no name"); #endif /* * Check for cross-device rename. */ if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } /* * If source and dest are the same, do nothing. */ if (tvp == fvp) { error = 0; goto abortit; } - error = vn_lock(fvp, LK_EXCLUSIVE, td); + error = vn_lock(fvp, LK_EXCLUSIVE); if (error) goto abortit; dp = VTODE(fdvp); ip = VTODE(fvp); /* * Be sure we are not renaming ".", "..", or an alias of ".". This * leads to a crippled directory tree. It's pretty tough to do a * "ls" or "pwd" with the "." directory entry missing, and "cd .." * doesn't work if the ".." entry is missing. */ if (ip->de_Attributes & ATTR_DIRECTORY) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || (ip->de_flag & DE_RENAME)) { VOP_UNLOCK(fvp, 0, td); error = EINVAL; goto abortit; } ip->de_flag |= DE_RENAME; doingdirectory++; } /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; /* * Remember direntry place to use for destination */ to_diroffset = dp->de_fndoffset; to_count = dp->de_fndcnt; /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to doscheckpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0, td); if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster) newparent = 1; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); /* * doscheckpath() vput()'s dp, * so we have to do a relookup afterwards */ error = doscheckpath(ip, dp); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; } if (xp != NULL) { /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if (xp->de_Attributes & ATTR_DIRECTORY) { if (!dosdirempty(xp)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = removede(dp, xp); if (error) goto bad; vput(tvp); xp = NULL; } /* * Convert the filename in tcnp into a dos filename. We copy this * into the denode and directory entry for the destination * file/directory. */ error = uniqdosname(VTODE(tdvp), tcnp, toname); if (error) goto abortit; /* * Since from wasn't locked at various places above, * have to do a relookup here. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost from startdir"); if (!newparent) VOP_UNLOCK(tdvp, 0, td); if (relookup(fdvp, &fvp, fcnp) == 0) vrele(fdvp); if (fvp == NULL) { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); if (newparent) VOP_UNLOCK(tdvp, 0, td); vrele(tdvp); vrele(ap->a_fvp); return 0; } xp = VTODE(fvp); zp = VTODE(fdvp); from_diroffset = zp->de_fndoffset; /* * Ensure that the directory entry still exists and has not * changed till now. If the source is a file the entry may * have been unlinked or renamed. In either case there is * no further work to be done. If the source is a directory * then it cannot have been rmdir'ed or renamed; this is * prohibited by the DE_RENAME flag. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); VOP_UNLOCK(fvp, 0, td); if (newparent) VOP_UNLOCK(fdvp, 0, td); vrele(ap->a_fvp); xp = NULL; } else { vrele(fvp); xp = NULL; /* * First write a new entry in the destination * directory and mark the entry in the source directory * as deleted. Then move the denode to the correct hash * chain for its new location in the filesystem. And, if * we moved a directory, then update its .. entry to point * to the new parent directory. */ bcopy(ip->de_Name, oldname, 11); bcopy(toname, ip->de_Name, 11); /* update denode */ dp->de_fndoffset = to_diroffset; dp->de_fndcnt = to_count; error = createde(ip, dp, (struct denode **)0, tcnp); if (error) { bcopy(oldname, ip->de_Name, 11); if (newparent) VOP_UNLOCK(fdvp, 0, td); VOP_UNLOCK(fvp, 0, td); goto bad; } ip->de_refcnt++; zp->de_fndoffset = from_diroffset; error = removede(zp, ip); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0, td); VOP_UNLOCK(fvp, 0, td); goto bad; } if (!doingdirectory) { error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0, &ip->de_dirclust, 0); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0, td); VOP_UNLOCK(fvp, 0, td); goto bad; } if (ip->de_dirclust == MSDOSFSROOT) ip->de_diroffset = to_diroffset; else ip->de_diroffset = to_diroffset & pmp->pm_crbomask; } reinsert(ip); if (newparent) VOP_UNLOCK(fdvp, 0, td); } /* * If we moved a directory to a new parent directory, then we must * fixup the ".." entry in the moved directory. */ if (doingdirectory && newparent) { cn = ip->de_StartCluster; if (cn == MSDOSFSROOT) { /* this should never happen */ panic("msdosfs_rename(): updating .. in root directory?"); } else bn = cntobn(pmp, cn); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ brelse(bp); VOP_UNLOCK(fvp, 0, td); goto bad; } dotdotp = (struct direntry *)bp->b_data + 1; putushort(dotdotp->deStartCluster, dp->de_StartCluster); if (FAT32(pmp)) putushort(dotdotp->deHighClust, dp->de_StartCluster >> 16); if (fvp->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else if ((error = bwrite(bp)) != 0) { /* XXX should downgrade to ro here, fs is corrupt */ VOP_UNLOCK(fvp, 0, td); goto bad; } } VOP_UNLOCK(fvp, 0, td); bad: if (xp) vput(tvp); vput(tdvp); out: ip->de_flag &= ~DE_RENAME; vrele(fdvp); vrele(fvp); return (error); } static struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", " ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", " ", /* the .. entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; static int msdosfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct direntry *denp; struct msdosfsmount *pmp = pdep->de_pmp; struct buf *bp; u_long newcluster, pcl; int bn; int error; struct denode ndirent; struct timespec ts; /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. */ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; bzero(&ndirent, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0); bzero(bp->b_data, pmp->pm_bpcluster); bcopy(&dosdirtemplate, bp->b_data, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = 0; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pdep->de_StartCluster >> 16); } if (ap->a_dvp->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else if ((error = bwrite(bp)) != 0) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_mkdir: no name"); #endif error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; ndirent.de_dev = pdep->de_dev; error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); return (0); bad: clusterfree(pmp, newcluster, NULL); bad2: return (error); } static int msdosfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct denode *ip, *dp; struct thread *td = cnp->cn_thread; int error; ip = VTODE(vp); dp = VTODE(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) { error = ENOTEMPTY; goto out; } /* * Delete the entry from the directory. For dos filesystems this * gets rid of the directory entry on disk, the in memory copy * still exists but the de_refcnt is <= 0. This prevents it from * being found by deget(). When the vput() on dep is done we give * up access and eventually msdosfs_reclaim() will be called which * will remove it from the denode cache. */ error = removede(dp, ip); if (error) goto out; /* * This is where we decrement the link count in the parent * directory. Since dos filesystems don't do this we just purge * the name cache. */ cache_purge(dvp); VOP_UNLOCK(dvp, 0, td); /* * Truncate the directory that is being deleted. */ error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred, td); cache_purge(vp); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); out: return (error); } /* * DOS filesystems don't know what symlinks are. */ static int msdosfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { return (EOPNOTSUPP); } static int msdosfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct mbnambuf nb; int error = 0; int diff; long n; int blsize; long on; u_long cn; uint64_t fileno; u_long dirsperblk; long bias = 0; daddr_t bn, lbn; struct buf *bp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; struct dirent dirbuf; struct uio *uio = ap->a_uio; u_long *cookies = NULL; int ncookies = 0; off_t offset, off; int chksum = -1; #ifdef MSDOSFS_DEBUG printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n", ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the the filesystem vnode, and hence can * retrieve the wrong block from the buffer cache for a plain file. * So, fail attempts to readdir() on a plain file. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); /* * To be safe, initialize dirbuf */ bzero(dirbuf.d_name, sizeof(dirbuf.d_name)); /* * If the user buffer is smaller than the size of one dos directory * entry or the file offset is not a multiple of the size of a * directory entry, then we fail the read. */ off = offset = uio->uio_offset; if (uio->uio_resid < sizeof(struct direntry) || (offset & (sizeof(struct direntry) - 1))) return (EINVAL); if (ap->a_ncookies) { ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); /* * If they are reading from the root directory then, we simulate * the . and .. entries since these don't exist in the root * directory. We also set the offset bias to make up for having to * simulate these entries. By this I mean that at file offset 64 we * read the first entry in the root directory that lives on disk. */ if (dep->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) { #if 0 printf("msdosfs_readdir(): going after . or .. in root dir, offset %d\n", offset); #endif bias = 2 * sizeof(struct direntry); if (offset < bias) { for (n = (int)offset / sizeof(struct direntry); n < 2; n++) { if (FAT32(pmp)) fileno = (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; if (pmp->pm_flags & MSDOSFS_LARGEFS) { dirbuf.d_fileno = msdosfs_fileno_map(pmp->pm_mountp, fileno); } else { dirbuf.d_fileno = (uint32_t)fileno; } dirbuf.d_type = DT_DIR; switch (n) { case 0: dirbuf.d_namlen = 1; strcpy(dirbuf.d_name, "."); break; case 1: dirbuf.d_namlen = 2; strcpy(dirbuf.d_name, ".."); break; } dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) goto out; error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) goto out; offset += sizeof(struct direntry); off = offset; if (cookies) { *cookies++ = offset; if (--ncookies <= 0) goto out; } } } } mbnambuf_init(&nb); off = offset; while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, blsize - bp->b_resid); if (n == 0) { brelse(bp); return (EIO); } /* * Convert from dos directory entries to fs-independent * directory entries. */ for (dentp = (struct direntry *)(bp->b_data + on); (char *)dentp < bp->b_data + on + n; dentp++, offset += sizeof(struct direntry)) { #if 0 printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n", dentp, prev, crnt, dentp->deName[0], dentp->deAttributes); #endif /* * If this is an unused entry, we can stop. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); goto out; } /* * Skip deleted entries. */ if (dentp->deName[0] == SLOT_DELETED) { chksum = -1; mbnambuf_init(&nb); continue; } /* * Handle Win95 long directory entries */ if (dentp->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn(&nb, (struct winentry *)dentp, chksum, pmp); continue; } /* * Skip volume labels */ if (dentp->deAttributes & ATTR_VOLUME) { chksum = -1; mbnambuf_init(&nb); continue; } /* * This computation of d_fileno must match * the computation of va_fileid in * msdosfs_getattr. */ if (dentp->deAttributes & ATTR_DIRECTORY) { fileno = getushort(dentp->deStartCluster); if (FAT32(pmp)) fileno |= getushort(dentp->deHighClust) << 16; /* if this is the root directory */ if (fileno == MSDOSFSROOT) if (FAT32(pmp)) fileno = (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; else fileno = (uint64_t)cntobn(pmp, fileno) * dirsperblk; dirbuf.d_type = DT_DIR; } else { fileno = (uoff_t)offset / sizeof(struct direntry); dirbuf.d_type = DT_REG; } if (pmp->pm_flags & MSDOSFS_LARGEFS) { dirbuf.d_fileno = msdosfs_fileno_map(pmp->pm_mountp, fileno); } else dirbuf.d_fileno = (uint32_t)fileno; if (chksum != winChksum(dentp)) { dirbuf.d_namlen = dos2unixfn(dentp->deName, (u_char *)dirbuf.d_name, dentp->deLowerCase | ((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ? (LCASE_BASE | LCASE_EXT) : 0), pmp); mbnambuf_init(&nb); } else mbnambuf_flush(&nb, &dirbuf); chksum = -1; dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) { brelse(bp); goto out; } error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) { brelse(bp); goto out; } if (cookies) { *cookies++ = offset + sizeof(struct direntry); if (--ncookies <= 0) { brelse(bp); goto out; } } off = offset + sizeof(struct direntry); } brelse(bp); } out: /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; uio->uio_offset = off; /* * Set the eofflag (NFS uses it) */ if (ap->a_eofflag) { if (dep->de_FileSize - (offset - bias) <= 0) *ap->a_eofflag = 1; else *ap->a_eofflag = 0; } return (error); } /*- * a_vp - pointer to the file's vnode * a_bn - logical block number within the file (cluster number for us) * a_bop - where to return the bufobj of the special file containing the fs * a_bnp - where to return the "physical" block number corresponding to a_bn * (relative to the special file; units are blocks of size DEV_BSIZE) * a_runp - where to return the "run past" a_bn. This is the count of logical * blocks whose physical blocks (together with a_bn's physical block) * are contiguous. * a_runb - where to return the "run before" a_bn. */ static int msdosfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { struct denode *dep; struct mount *mp; struct msdosfsmount *pmp; struct vnode *vp; daddr_t runbn; u_long cn; int bnpercn, error, maxio, maxrun, run; vp = ap->a_vp; dep = VTODE(vp); pmp = dep->de_pmp; if (ap->a_bop != NULL) *ap->a_bop = &pmp->pm_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; cn = ap->a_bn; if (cn != ap->a_bn) return (EFBIG); error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL); if (error != 0 || (ap->a_runp == NULL && ap->a_runb == NULL)) return (error); mp = vp->v_mount; maxio = mp->mnt_iosize_max / mp->mnt_stat.f_iosize; bnpercn = de_cn2bn(pmp, 1); if (ap->a_runp != NULL) { maxrun = ulmin(maxio - 1, pmp->pm_maxcluster - cn); for (run = 1; run <= maxrun; run++) { if (pcbmap(dep, cn + run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp + run * bnpercn) break; } *ap->a_runp = run - 1; } if (ap->a_runb != NULL) { maxrun = ulmin(maxio - 1, cn); for (run = 1; run < maxrun; run++) { if (pcbmap(dep, cn - run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp - run * bnpercn) break; } *ap->a_runb = run - 1; } return (0); } static int msdosfs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct denode *dep = VTODE(ap->a_vp); struct bufobj *bo; int error = 0; daddr_t blkno; /* * If we don't already know the filesystem relative block number * then get it using pcbmap(). If pcbmap() returns the block * number as -1 then we've got a hole in the file. DOS filesystems * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if (bp->b_blkno == -1) { bufdone(bp); return (0); } /* * Read/write the block from/to the disk that contains the desired * file block. */ bp->b_iooffset = dbtob(bp->b_blkno); bo = dep->de_pmp->pm_bo; BO_STRATEGY(bo, bp); return (0); } static int msdosfs_print(ap) struct vop_print_args /* { struct vnode *vp; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ", dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset); printf("on dev %s\n", devtoname(dep->de_dev)); return (0); } static int msdosfs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? WIN_MAXLEN : 12; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } static int msdosfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; u_char a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); return (lf_advlock(ap, &dep->de_lockf, dep->de_FileSize)); } static int msdosfs_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; struct fid *a_fhp; } */ *ap; { struct denode *dep; struct defid *defhp; dep = VTODE(ap->a_vp); defhp = (struct defid *)ap->a_fhp; defhp->defid_len = sizeof(struct defid); defhp->defid_dirclust = dep->de_dirclust; defhp->defid_dirofs = dep->de_diroffset; /* defhp->defid_gen = dep->de_gen; */ return (0); } /* Global vfs data structures for msdosfs */ struct vop_vector msdosfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = msdosfs_access, .vop_advlock = msdosfs_advlock, .vop_bmap = msdosfs_bmap, .vop_cachedlookup = msdosfs_lookup, .vop_open = msdosfs_open, .vop_close = msdosfs_close, .vop_create = msdosfs_create, .vop_fsync = msdosfs_fsync, .vop_getattr = msdosfs_getattr, .vop_inactive = msdosfs_inactive, .vop_link = msdosfs_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = msdosfs_mkdir, .vop_mknod = msdosfs_mknod, .vop_pathconf = msdosfs_pathconf, .vop_print = msdosfs_print, .vop_read = msdosfs_read, .vop_readdir = msdosfs_readdir, .vop_reclaim = msdosfs_reclaim, .vop_remove = msdosfs_remove, .vop_rename = msdosfs_rename, .vop_rmdir = msdosfs_rmdir, .vop_setattr = msdosfs_setattr, .vop_strategy = msdosfs_strategy, .vop_symlink = msdosfs_symlink, .vop_write = msdosfs_write, .vop_vptofh = msdosfs_vptofh, }; Index: head/sys/fs/ntfs/ntfs_vfsops.c =================================================================== --- head/sys/fs/ntfs/ntfs_vfsops.c (revision 175201) +++ head/sys/fs/ntfs/ntfs_vfsops.c (revision 175202) @@ -1,795 +1,795 @@ /* $NetBSD: ntfs_vfsops.c,v 1.23 1999/11/15 19:38:14 jdolecek Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#define NTFS_DEBUG 1*/ #include #include #include #include #include #include static MALLOC_DEFINE(M_NTFSMNT, "ntfs_mount", "NTFS mount structure"); MALLOC_DEFINE(M_NTFSNTNODE,"ntfs_ntnode", "NTFS ntnode information"); MALLOC_DEFINE(M_NTFSFNODE,"ntfs_fnode", "NTFS fnode information"); MALLOC_DEFINE(M_NTFSDIR,"ntfs_dir", "NTFS dir buffer"); struct sockaddr; static int ntfs_mountfs(register struct vnode *, struct mount *, struct thread *); static int ntfs_calccfree(struct ntfsmount *ntmp, cn_t *cfreep); static vfs_init_t ntfs_init; static vfs_uninit_t ntfs_uninit; static vfs_vget_t ntfs_vget; static vfs_fhtovp_t ntfs_fhtovp; static vfs_cmount_t ntfs_cmount; static vfs_mount_t ntfs_mount; static vfs_root_t ntfs_root; static vfs_statfs_t ntfs_statfs; static vfs_unmount_t ntfs_unmount; static b_strategy_t ntfs_bufstrategy; /* * Buffer operations for NTFS vnodes. * We punt on VOP_BMAP, so we need to do * strategy on the file's vnode rather * than the underlying device's */ static struct buf_ops ntfs_vnbufops = { .bop_name = "NTFS", .bop_strategy = ntfs_bufstrategy, }; static int ntfs_init ( struct vfsconf *vcp ) { ntfs_nthashinit(); ntfs_toupper_init(); return 0; } static int ntfs_uninit ( struct vfsconf *vcp ) { ntfs_toupper_destroy(); ntfs_nthashdestroy(); return 0; } static int ntfs_cmount ( struct mntarg *ma, void *data, int flags, struct thread *td ) { int error; struct ntfs_args args; error = copyin(data, (caddr_t)&args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mode", "%d", args.mode); ma = mount_argb(ma, args.flag & NTFS_MFLAG_CASEINS, "nocaseins"); ma = mount_argb(ma, args.flag & NTFS_MFLAG_ALLNAMES, "noallnames"); if (args.flag & NTFS_MFLAG_KICONV) { ma = mount_argsu(ma, "cs_ntfs", args.cs_ntfs, 64); ma = mount_argsu(ma, "cs_local", args.cs_local, 64); } error = kernel_mount(ma, flags); return (error); } static const char *ntfs_opts[] = { "from", "export", "uid", "gid", "mode", "caseins", "allnames", "kiconv", "cs_ntfs", "cs_local", NULL }; static int ntfs_mount ( struct mount *mp, struct thread *td ) { int err = 0, error; struct vnode *devvp; struct nameidata ndp; char *from; if (vfs_filteropt(mp->mnt_optnew, ntfs_opts)) return (EINVAL); from = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); /* * If updating, check whether changing from read-only to * read/write. */ if (mp->mnt_flag & MNT_UPDATE) { if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) { /* Process export requests in vfs_mount.c */ goto success; } else { printf("ntfs_mount(): MNT_UPDATE not supported\n"); err = EINVAL; goto error_1; } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); err = namei(&ndp); if (err) { /* can't get devvp!*/ goto error_1; } NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &err)) { vput(devvp); return (err); } if (mp->mnt_flag & MNT_UPDATE) { #if 0 /* ******************** * UPDATE ******************** */ if (devvp != ntmp->um_devvp) err = EINVAL; /* needs translation */ vput(devvp); if (err) return (err); #endif } else { /* ******************** * NEW MOUNT ******************** */ /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. Note that vfs_mount() handles * copying the mountpoint f_mntonname for us, so we * don't have to do it here unless we want to set it * to something other than "path" for some rason. */ /* Save "mounted from" info for mount point (NULL pad)*/ vfs_mountedfrom(mp, from); err = ntfs_mountfs(devvp, mp, td); } if (err) { vrele(devvp); return (err); } goto success; error_1: /* no state to back out*/ /* XXX: missing NDFREE(&ndp, ...) */ success: return(err); } /* * Common code for mount and mountroot */ int ntfs_mountfs(devvp, mp, td) register struct vnode *devvp; struct mount *mp; struct thread *td; { struct buf *bp; struct ntfsmount *ntmp; struct cdev *dev = devvp->v_rdev; int error, ronly, i, v; struct vnode *vp; struct g_consumer *cp; struct g_provider *pp; char *cs_ntfs, *cs_local; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; DROP_GIANT(); g_topology_lock(); /* * XXX: Do not allow more than one consumer to open a device * associated with a particular GEOM provider. * This disables multiple read-only mounts of a device, * but it gets rid of panics in vget() when you try to * mount the same device more than once. */ pp = g_dev_getprovider(devvp->v_rdev); if ((pp != NULL) && ((pp->acr | pp->acw | pp->ace ) != 0)) error = EPERM; else error = g_vfs_open(devvp, &cp, "ntfs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); bp = NULL; error = bread(devvp, BBLOCK, BBSIZE, NOCRED, &bp); if (error) goto out; ntmp = malloc( sizeof *ntmp, M_NTFSMNT, M_WAITOK | M_ZERO); bcopy( bp->b_data, &ntmp->ntm_bootfile, sizeof(struct bootfile) ); /* * We must not cache the boot block if its size is not exactly * one cluster in order to avoid confusing the buffer cache when * the boot file is read later by ntfs_readntvattr_plain(), which * reads a cluster at a time. */ if (ntfs_cntob(1) != BBSIZE) bp->b_flags |= B_NOCACHE; brelse( bp ); bp = NULL; if (strncmp((const char *)ntmp->ntm_bootfile.bf_sysid, NTFS_BBID, NTFS_BBIDLEN)) { error = EINVAL; dprintf(("ntfs_mountfs: invalid boot block\n")); goto out; } { int8_t cpr = ntmp->ntm_mftrecsz; if( cpr > 0 ) ntmp->ntm_bpmftrec = ntmp->ntm_spc * cpr; else ntmp->ntm_bpmftrec = (1 << (-cpr)) / ntmp->ntm_bps; } dprintf(("ntfs_mountfs(): bps: %d, spc: %d, media: %x, mftrecsz: %d (%d sects)\n", ntmp->ntm_bps,ntmp->ntm_spc,ntmp->ntm_bootfile.bf_media, ntmp->ntm_mftrecsz,ntmp->ntm_bpmftrec)); dprintf(("ntfs_mountfs(): mftcn: 0x%x|0x%x\n", (u_int32_t)ntmp->ntm_mftcn,(u_int32_t)ntmp->ntm_mftmirrcn)); ntmp->ntm_mountp = mp; ntmp->ntm_devvp = devvp; if (1 == vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v)) ntmp->ntm_uid = v; if (1 == vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v)) ntmp->ntm_gid = v; if (1 == vfs_scanopt(mp->mnt_optnew, "mode", "%d", &v)) ntmp->ntm_mode = v & ACCESSPERMS; vfs_flagopt(mp->mnt_optnew, "caseins", &ntmp->ntm_flag, NTFS_MFLAG_CASEINS); vfs_flagopt(mp->mnt_optnew, "allnames", &ntmp->ntm_flag, NTFS_MFLAG_ALLNAMES); ntmp->ntm_cp = cp; ntmp->ntm_bo = &devvp->v_bufobj; cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error); if (error && error != ENOENT) goto out; cs_ntfs = vfs_getopts(mp->mnt_optnew, "cs_ntfs", &error); if (error && error != ENOENT) goto out; /* Copy in the 8-bit to Unicode conversion table */ /* Initialize Unicode to 8-bit table from 8toU table */ ntfs_82u_init(ntmp, cs_local, cs_ntfs); if (cs_local != NULL && cs_ntfs != NULL) ntfs_u28_init(ntmp, NULL, cs_local, cs_ntfs); else ntfs_u28_init(ntmp, ntmp->ntm_82u, cs_local, cs_ntfs); mp->mnt_data = ntmp; dprintf(("ntfs_mountfs(): case-%s,%s uid: %d, gid: %d, mode: %o\n", (ntmp->ntm_flag & NTFS_MFLAG_CASEINS)?"insens.":"sens.", (ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES)?" allnames,":"", ntmp->ntm_uid, ntmp->ntm_gid, ntmp->ntm_mode)); /* * We read in some system nodes to do not allow * reclaim them and to have everytime access to them. */ { int pi[3] = { NTFS_MFTINO, NTFS_ROOTINO, NTFS_BITMAPINO }; for (i=0; i<3; i++) { error = VFS_VGET(mp, pi[i], LK_EXCLUSIVE, &(ntmp->ntm_sysvn[pi[i]])); if(error) goto out1; ntmp->ntm_sysvn[pi[i]]->v_vflag |= VV_SYSTEM; VREF(ntmp->ntm_sysvn[pi[i]]); vput(ntmp->ntm_sysvn[pi[i]]); } } /* read the Unicode lowercase --> uppercase translation table, * if necessary */ if ((error = ntfs_toupper_use(mp, ntmp))) goto out1; /* * Scan $BitMap and count free clusters */ error = ntfs_calccfree(ntmp, &ntmp->ntm_cfree); if(error) goto out1; /* * Read and translate to internal format attribute * definition file. */ { int num,j; struct attrdef ad; /* Open $AttrDef */ error = VFS_VGET(mp, NTFS_ATTRDEFINO, LK_EXCLUSIVE, &vp ); if(error) goto out1; /* Count valid entries */ for(num=0;;num++) { error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, num * sizeof(ad), sizeof(ad), &ad, NULL); if (error) goto out1; if (ad.ad_name[0] == 0) break; } /* Alloc memory for attribute definitions */ MALLOC(ntmp->ntm_ad, struct ntvattrdef *, num * sizeof(struct ntvattrdef), M_NTFSMNT, M_WAITOK); ntmp->ntm_adnum = num; /* Read them and translate */ for(i=0;intm_ad[i].ad_name[j] = ad.ad_name[j]; } while(ad.ad_name[j++]); ntmp->ntm_ad[i].ad_namelen = j - 1; ntmp->ntm_ad[i].ad_type = ad.ad_type; } vput(vp); } mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = 0; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); return (0); out1: for(i=0;intm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); if (vflush(mp, 0, 0, td)) dprintf(("ntfs_mountfs: vflush failed\n")); out: if (bp) brelse(bp); DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); return (error); } static int ntfs_unmount( struct mount *mp, int mntflags, struct thread *td) { struct ntfsmount *ntmp; int error, flags, i; dprintf(("ntfs_unmount: unmounting...\n")); ntmp = VFSTONTFS(mp); flags = 0; if(mntflags & MNT_FORCE) flags |= FORCECLOSE; dprintf(("ntfs_unmount: vflushing...\n")); error = vflush(mp, 0, flags | SKIPSYSTEM, td); if (error) { printf("ntfs_unmount: vflush failed: %d\n",error); return (error); } /* Check if only system vnodes are rest */ for(i=0;intm_sysvn[i]) && (vrefcnt(ntmp->ntm_sysvn[i]) > 1)) return (EBUSY); /* Dereference all system vnodes */ for(i=0;intm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); /* vflush system vnodes */ error = vflush(mp, 0, flags, td); if (error) printf("ntfs_unmount: vflush failed(sysnodes): %d\n",error); vinvalbuf(ntmp->ntm_devvp, V_SAVE, td, 0, 0); DROP_GIANT(); g_topology_lock(); g_vfs_close(ntmp->ntm_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ntmp->ntm_devvp); /* free the toupper table, if this has been last mounted ntfs volume */ ntfs_toupper_unuse(); dprintf(("ntfs_umount: freeing memory...\n")); ntfs_u28_uninit(ntmp); ntfs_82u_uninit(ntmp); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); FREE(ntmp->ntm_ad, M_NTFSMNT); FREE(ntmp, M_NTFSMNT); return (error); } static int ntfs_root( struct mount *mp, int flags, struct vnode **vpp, struct thread *td ) { struct vnode *nvp; int error = 0; dprintf(("ntfs_root(): sysvn: %p\n", VFSTONTFS(mp)->ntm_sysvn[NTFS_ROOTINO])); error = VFS_VGET(mp, (ino_t)NTFS_ROOTINO, LK_EXCLUSIVE, &nvp); if(error) { printf("ntfs_root: VFS_VGET failed: %d\n",error); return (error); } *vpp = nvp; return (0); } static int ntfs_calccfree( struct ntfsmount *ntmp, cn_t *cfreep) { struct vnode *vp; u_int8_t *tmp; int j, error; long cfree = 0; size_t bmsize, i; vp = ntmp->ntm_sysvn[NTFS_BITMAPINO]; bmsize = VTOF(vp)->f_size; MALLOC(tmp, u_int8_t *, bmsize, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, 0, bmsize, tmp, NULL); if (error) goto out; for(i=0;intm_sysvn[NTFS_MFTINO])->f_size; mftallocated = VTOF(ntmp->ntm_sysvn[NTFS_MFTINO])->f_allocated; sbp->f_type = mp->mnt_vfc->vfc_typenum; sbp->f_bsize = ntmp->ntm_bps; sbp->f_iosize = ntmp->ntm_bps * ntmp->ntm_spc; sbp->f_blocks = ntmp->ntm_bootfile.bf_spv; sbp->f_bfree = sbp->f_bavail = ntfs_cntobn(ntmp->ntm_cfree); sbp->f_ffree = sbp->f_bfree / ntmp->ntm_bpmftrec; sbp->f_files = mftallocated / ntfs_bntob(ntmp->ntm_bpmftrec) + sbp->f_ffree; sbp->f_flags = mp->mnt_flag; return (0); } /*ARGSUSED*/ static int ntfs_fhtovp( struct mount *mp, struct fid *fhp, struct vnode **vpp) { struct vnode *nvp; struct ntfid *ntfhp = (struct ntfid *)fhp; int error; ddprintf(("ntfs_fhtovp(): %d\n", ntfhp->ntfid_ino)); if ((error = VFS_VGET(mp, ntfhp->ntfid_ino, LK_EXCLUSIVE, &nvp)) != 0) { *vpp = NULLVP; return (error); } /* XXX as unlink/rmdir/mkdir/creat are not currently possible * with NTFS, we don't need to check anything else for now */ *vpp = nvp; vnode_create_vobject(nvp, VTOF(nvp)->f_size, curthread); return (0); } int ntfs_vgetex( struct mount *mp, ino_t ino, u_int32_t attrtype, char *attrname, u_long lkflags, u_long flags, struct thread *td, struct vnode **vpp) { int error; register struct ntfsmount *ntmp; struct ntnode *ip; struct fnode *fp; struct vnode *vp; enum vtype f_type; dprintf(("ntfs_vgetex: ino: %d, attr: 0x%x:%s, lkf: 0x%lx, f: 0x%lx\n", ino, attrtype, attrname?attrname:"", (u_long)lkflags, (u_long)flags)); ntmp = VFSTONTFS(mp); *vpp = NULL; /* Get ntnode */ error = ntfs_ntlookup(ntmp, ino, &ip); if (error) { printf("ntfs_vget: ntfs_ntget failed\n"); return (error); } /* It may be not initialized fully, so force load it */ if (!(flags & VG_DONTLOADIN) && !(ip->i_flag & IN_LOADED)) { error = ntfs_loadntnode(ntmp, ip); if(error) { printf("ntfs_vget: CAN'T LOAD ATTRIBUTES FOR INO: %d\n", ip->i_number); ntfs_ntput(ip); return (error); } } error = ntfs_fget(ntmp, ip, attrtype, attrname, &fp); if (error) { printf("ntfs_vget: ntfs_fget failed\n"); ntfs_ntput(ip); return (error); } f_type = VNON; if (!(flags & VG_DONTVALIDFN) && !(fp->f_flag & FN_VALID)) { if ((ip->i_frflag & NTFS_FRFLAG_DIR) && (fp->f_attrtype == NTFS_A_DATA && fp->f_attrname == NULL)) { f_type = VDIR; } else if (flags & VG_EXT) { f_type = VNON; fp->f_size = fp->f_allocated = 0; } else { f_type = VREG; error = ntfs_filesize(ntmp, fp, &fp->f_size, &fp->f_allocated); if (error) { ntfs_ntput(ip); return (error); } } fp->f_flag |= FN_VALID; } if (FTOV(fp)) { vget(FTOV(fp), lkflags, td); *vpp = FTOV(fp); ntfs_ntput(ip); return (0); } error = getnewvnode("ntfs", ntmp->ntm_mountp, &ntfs_vnodeops, &vp); if(error) { ntfs_frele(fp); ntfs_ntput(ip); return (error); } /* XXX: Too early for mpsafe fs, lacks vnode lock */ error = insmntque(vp, ntmp->ntm_mountp); if (error) { ntfs_frele(fp); ntfs_ntput(ip); return (error); } dprintf(("ntfs_vget: vnode: %p for ntnode: %d\n", vp,ino)); fp->f_vp = vp; vp->v_data = fp; vp->v_type = f_type; vp->v_bufobj.bo_ops = &ntfs_vnbufops; vp->v_bufobj.bo_private = vp; if (ino == NTFS_ROOTINO) vp->v_vflag |= VV_ROOT; ntfs_ntput(ip); if (lkflags & LK_TYPE_MASK) { - error = vn_lock(vp, lkflags, td); + error = vn_lock(vp, lkflags); if (error) { vput(vp); return (error); } } *vpp = vp; return (0); } static int ntfs_vget( struct mount *mp, ino_t ino, int lkflags, struct vnode **vpp) { return ntfs_vgetex(mp, ino, NTFS_A_DATA, NULL, lkflags, 0, curthread, vpp); } static void ntfs_bufstrategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int rc; vp = bo->bo_private; KASSERT(bo == &vp->v_bufobj, ("BO/VP mismatch: vp %p bo %p != %p", vp, &vp->v_bufobj, bo)); rc = VOP_STRATEGY(vp, bp); KASSERT(rc == 0, ("NTFS VOP_STRATEGY failed: bp=%p, " "vp=%p, rc=%d", bp, vp, rc)); } static struct vfsops ntfs_vfsops = { .vfs_fhtovp = ntfs_fhtovp, .vfs_init = ntfs_init, .vfs_cmount = ntfs_cmount, .vfs_mount = ntfs_mount, .vfs_root = ntfs_root, .vfs_statfs = ntfs_statfs, .vfs_uninit = ntfs_uninit, .vfs_unmount = ntfs_unmount, .vfs_vget = ntfs_vget, }; VFS_SET(ntfs_vfsops, ntfs, 0); MODULE_VERSION(ntfs, 1); Index: head/sys/fs/ntfs/ntfs_vnops.c =================================================================== --- head/sys/fs/ntfs/ntfs_vnops.c (revision 175201) +++ head/sys/fs/ntfs/ntfs_vnops.c (revision 175202) @@ -1,777 +1,777 @@ /* $NetBSD: ntfs_vnops.c,v 1.23 1999/10/31 19:45:27 jdolecek Exp $ */ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#define NTFS_DEBUG 1*/ #include #include #include #include /* for pathconf(2) constants */ static vop_read_t ntfs_read; static vop_write_t ntfs_write; static vop_getattr_t ntfs_getattr; static vop_inactive_t ntfs_inactive; static vop_reclaim_t ntfs_reclaim; static vop_bmap_t ntfs_bmap; static vop_strategy_t ntfs_strategy; static vop_access_t ntfs_access; static vop_open_t ntfs_open; static vop_close_t ntfs_close; static vop_readdir_t ntfs_readdir; static vop_cachedlookup_t ntfs_lookup; static vop_fsync_t ntfs_fsync; static vop_pathconf_t ntfs_pathconf; static vop_vptofh_t ntfs_vptofh; int ntfs_prtactive = 1; /* 1 => print out reclaim of active vnodes */ /* * This is a noop, simply returning what one has been given. */ int ntfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { struct vnode *vp = ap->a_vp; dprintf(("ntfs_bmap: vn: %p, blk: %d\n", ap->a_vp,(u_int32_t)ap->a_bn)); if (ap->a_bop != NULL) *ap->a_bop = &vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int ntfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; struct buf *bp; daddr_t cn; int resid, off, toread; int error; dprintf(("ntfs_read: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); dprintf(("ntfs_read: filesize: %d",(u_int32_t)fp->f_size)); /* don't allow reading after end of file */ if (uio->uio_offset > fp->f_size) return (0); resid = MIN(uio->uio_resid, fp->f_size - uio->uio_offset); dprintf((", resid: %d\n", resid)); error = 0; while (resid) { cn = ntfs_btocn(uio->uio_offset); off = ntfs_btocnoff(uio->uio_offset); toread = MIN(off + resid, ntfs_cntob(1)); error = bread(vp, cn, ntfs_cntob(1), NOCRED, &bp); if (error) { brelse(bp); break; } error = uiomove(bp->b_data + off, toread - off, uio); if(error) { brelse(bp); break; } brelse(bp); resid -= toread - off; } return (error); } static int ntfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); register struct vattr *vap = ap->a_vap; dprintf(("ntfs_getattr: %d, flags: %d\n",ip->i_number,ip->i_flag)); vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mp->ntm_mode; vap->va_nlink = (ip->i_nlink || ip->i_flag & IN_LOADED ? ip->i_nlink : 1); vap->va_uid = ip->i_mp->ntm_uid; vap->va_gid = ip->i_mp->ntm_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = fp->f_size; vap->va_bytes = fp->f_allocated; vap->va_atime = ntfs_nttimetounix(fp->f_times.t_access); vap->va_mtime = ntfs_nttimetounix(fp->f_times.t_write); vap->va_ctime = ntfs_nttimetounix(fp->f_times.t_create); vap->va_flags = ip->i_flag; vap->va_gen = 0; vap->va_blocksize = ip->i_mp->ntm_spc * ip->i_mp->ntm_bps; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Last reference to an ntnode. If necessary, write or delete it. */ int ntfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; #ifdef NTFS_DEBUG register struct ntnode *ip = VTONT(vp); #endif dprintf(("ntfs_inactive: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vrefcnt(vp) != 0) vprint("ntfs_inactive: pushing active", vp); /* XXX since we don't support any filesystem changes * right now, nothing more needs to be done */ return (0); } /* * Reclaim an fnode/ntnode so that it can be used for other purposes. */ int ntfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); int error; dprintf(("ntfs_reclaim: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vrefcnt(vp) != 0) vprint("ntfs_reclaim: pushing active", vp); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); if ((error = ntfs_ntget(ip)) != 0) return (error); /* Purge old data structures associated with the inode. */ ntfs_frele(fp); ntfs_ntput(ip); vp->v_data = NULL; return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ntfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct ntfsmount *ntmp = ip->i_mp; int error; dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_offset,(u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); dprintf(("strategy: bcount: %d flags: 0x%x\n", (u_int32_t)bp->b_bcount,bp->b_flags)); if (bp->b_iocmd == BIO_READ) { u_int32_t toread; if (ntfs_cntob(bp->b_blkno) >= fp->f_size) { clrbuf(bp); error = 0; } else { toread = MIN(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: toread: %d, fsize: %d\n", toread,(u_int32_t)fp->f_size)); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno), toread, bp->b_data, NULL); if (error) { printf("ntfs_strategy: ntfs_readattr failed\n"); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } bzero(bp->b_data + toread, bp->b_bcount - toread); } } else { size_t tmp; u_int32_t towrite; if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) { printf("ntfs_strategy: CAN'T EXTEND FILE\n"); bp->b_error = error = EFBIG; bp->b_ioflags |= BIO_ERROR; } else { towrite = MIN(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n", towrite,(u_int32_t)fp->f_size)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite, bp->b_data, &tmp, NULL); if (error) { printf("ntfs_strategy: ntfs_writeattr fail\n"); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } } } bufdone(bp); return (error); } static int ntfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int64_t towrite; size_t written; int error; dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); dprintf(("ntfs_write: filesize: %d",(u_int32_t)fp->f_size)); if (uio->uio_resid + uio->uio_offset > fp->f_size) { printf("ntfs_write: CAN'T WRITE BEYOND END OF FILE\n"); return (EFBIG); } towrite = MIN(uio->uio_resid, fp->f_size - uio->uio_offset); dprintf((", towrite: %d\n",(u_int32_t)towrite)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, towrite, NULL, &written, uio); #ifdef NTFS_DEBUG if (error) printf("ntfs_write: ntfs_writeattr failed: %d\n", error); #endif return (error); } int ntfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct ntnode *ip = VTONT(vp); mode_t mode = ap->a_mode; #ifdef QUOTA int error; #endif dprintf(("ntfs_access: %d\n",ip->i_number)); /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } return (vaccess(vp->v_type, ip->i_mp->ntm_mode, ip->i_mp->ntm_uid, ip->i_mp->ntm_gid, ap->a_mode, ap->a_cred, NULL)); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ static int ntfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { #ifdef NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_open: %d\n",ip->i_number); #endif vnode_create_vobject(ap->a_vp, VTOF(ap->a_vp)->f_size, ap->a_td); /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ntfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { #ifdef NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_close: %d\n",ip->i_number); #endif return (0); } int ntfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; int i, j, error = 0; wchar c; u_int32_t faked = 0, num; int ncookies = 0; struct dirent cde; off_t off; dprintf(("ntfs_readdir %d off: %d resid: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; /* Simulate . in every dir except ROOT */ if( ip->i_number != NTFS_ROOTINO ) { struct dirent dot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 1, "." }; if( uio->uio_offset < sizeof(struct dirent) ) { dot.d_fileno = ip->i_number; error = uiomove((char *)&dot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } } /* Simulate .. in every dir including ROOT */ if( uio->uio_offset < 2 * sizeof(struct dirent) ) { struct dirent dotdot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 2, ".." }; error = uiomove((char *)&dotdot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } faked = (ip->i_number == NTFS_ROOTINO) ? 1 : 2; num = uio->uio_offset / sizeof(struct dirent) - faked; while( uio->uio_resid >= sizeof(struct dirent) ) { struct attr_indexentry *iep; error = ntfs_ntreaddir(ntmp, fp, num, &iep); if(error) return (error); if( NULL == iep ) break; for(; !(iep->ie_flag & NTFS_IEFLAG_LAST) && (uio->uio_resid >= sizeof(struct dirent)); iep = NTFS_NEXTREC(iep, struct attr_indexentry *)) { if(!ntfs_isnamepermitted(ntmp,iep)) continue; for(i=0, j=0; iie_fnamelen; i++, j++) { c = NTFS_U28(iep->ie_fname[i]); if (c&0xFF00) cde.d_name[j++] = (char)(c>>8); cde.d_name[j] = (char)c&0xFF; } cde.d_name[j] = '\0'; dprintf(("ntfs_readdir: elem: %d, fname:[%s] type: %d, flag: %d, ", num, cde.d_name, iep->ie_fnametype, iep->ie_flag)); cde.d_namlen = j; cde.d_fileno = iep->ie_number; cde.d_type = (iep->ie_fflag & NTFS_FFLAG_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); dprintf(("%s\n", (cde.d_type == DT_DIR) ? "dir":"reg")); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if(error) return (error); ncookies++; num++; } } dprintf(("ntfs_readdir: %d entries (%d bytes) read\n", ncookies,(u_int)(uio->uio_offset - off))); dprintf(("ntfs_readdir: off: %d resid: %d\n", (u_int32_t)uio->uio_offset,uio->uio_resid)); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; u_long *cookies; u_long *cookiep; ddprintf(("ntfs_readdir: %d cookies\n",ncookies)); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ntfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } /* if (ap->a_eofflag) *ap->a_eofflag = VTONT(ap->a_vp)->i_size <= uio->uio_offset; */ return (error); } int ntfs_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct ntnode *dip = VTONT(dvp); struct ntfsmount *ntmp = dip->i_mp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; dprintf(("ntfs_lookup: \"%.*s\" (%ld bytes) in %d\n", (int)cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_namelen, dip->i_number)); error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_thread); if(error) return (error); if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { dprintf(("ntfs_lookup: faking . directory in %d\n", dip->i_number)); VREF(dvp); *ap->a_vpp = dvp; error = 0; } else if (cnp->cn_flags & ISDOTDOT) { struct ntvattr *vap; dprintf(("ntfs_lookup: faking .. directory in %d\n", dip->i_number)); error = ntfs_ntvattrget(ntmp, dip, NTFS_A_NAME, NULL, 0, &vap); if(error) return (error); VOP_UNLOCK(dvp,0,cnp->cn_thread); dprintf(("ntfs_lookup: parentdir: %d\n", vap->va_a_name->n_pnumber)); error = VFS_VGET(ntmp->ntm_mountp, vap->va_a_name->n_pnumber, LK_EXCLUSIVE, ap->a_vpp); ntfs_ntvattrrele(vap); if (error) { - vn_lock(dvp,LK_EXCLUSIVE|LK_RETRY,cnp->cn_thread); + vn_lock(dvp,LK_EXCLUSIVE|LK_RETRY); return (error); } } else { error = ntfs_ntlookupfile(ntmp, dvp, cnp, ap->a_vpp); if (error) { dprintf(("ntfs_ntlookupfile: returned %d\n", error)); return (error); } dprintf(("ntfs_lookup: found ino: %d\n", VTONT(*ap->a_vpp)->i_number)); } if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *ap->a_vpp, cnp); return (error); } /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. Maybe we * could just do a sync if they try an fsync on a directory file. */ static int ntfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { return (0); } /* * Return POSIX pathconf information applicable to NTFS filesystem */ int ntfs_pathconf(ap) struct vop_pathconf_args *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = NTFS_MAXFILENAME; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } int ntfs_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; struct fid *a_fhp; } */ *ap; { register struct ntnode *ntp; register struct ntfid *ntfhp; ddprintf(("ntfs_fhtovp(): %p\n", ap->a_vp)); ntp = VTONT(ap->a_vp); ntfhp = (struct ntfid *)ap->a_fhp; ntfhp->ntfid_len = sizeof(struct ntfid); ntfhp->ntfid_ino = ntp->i_number; /* ntfhp->ntfid_gen = ntp->i_gen; */ return (0); } /* * Global vfs data structures */ struct vop_vector ntfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = ntfs_access, .vop_bmap = ntfs_bmap, .vop_cachedlookup = ntfs_lookup, .vop_close = ntfs_close, .vop_fsync = ntfs_fsync, .vop_getattr = ntfs_getattr, .vop_inactive = ntfs_inactive, .vop_lookup = vfs_cache_lookup, .vop_open = ntfs_open, .vop_pathconf = ntfs_pathconf, .vop_read = ntfs_read, .vop_readdir = ntfs_readdir, .vop_reclaim = ntfs_reclaim, .vop_strategy = ntfs_strategy, .vop_write = ntfs_write, .vop_vptofh = ntfs_vptofh, }; Index: head/sys/fs/nullfs/null_subr.c =================================================================== --- head/sys/fs/nullfs/null_subr.c (revision 175201) +++ head/sys/fs/nullfs/null_subr.c (revision 175202) @@ -1,349 +1,349 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)null_subr.c 8.7 (Berkeley) 5/14/95 * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #define LOG2_SIZEVNODE 8 /* log2(sizeof struct vnode) */ #define NNULLNODECACHE 16 /* * Null layer cache: * Each cache entry holds a reference to the lower vnode * along with a pointer to the alias vnode. When an * entry is added the lower vnode is VREF'd. When the * alias is removed the lower vnode is vrele'd. */ #define NULL_NHASH(vp) \ (&null_node_hashtbl[(((uintptr_t)vp)>>LOG2_SIZEVNODE) & null_node_hash]) static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl; static u_long null_node_hash; struct mtx null_hashmtx; static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table"); MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part"); static struct vnode * null_hashget(struct mount *, struct vnode *); static struct vnode * null_hashins(struct mount *, struct null_node *); /* * Initialise cache headers */ int nullfs_init(vfsp) struct vfsconf *vfsp; { NULLFSDEBUG("nullfs_init\n"); /* printed during system boot */ null_node_hashtbl = hashinit(NNULLNODECACHE, M_NULLFSHASH, &null_node_hash); mtx_init(&null_hashmtx, "nullhs", NULL, MTX_DEF); return (0); } int nullfs_uninit(vfsp) struct vfsconf *vfsp; { mtx_destroy(&null_hashmtx); free(null_node_hashtbl, M_NULLFSHASH); return (0); } /* * Return a VREF'ed alias for lower vnode if already exists, else 0. * Lower vnode should be locked on entry and will be left locked on exit. */ static struct vnode * null_hashget(mp, lowervp) struct mount *mp; struct vnode *lowervp; { struct thread *td = curthread; /* XXX */ struct null_node_hashhead *hd; struct null_node *a; struct vnode *vp; int error; ASSERT_VOP_LOCKED(lowervp, "null_hashget"); /* * Find hash base, and then search the (two-way) linked * list looking for a null_node structure which is referencing * the lower vnode. If found, the increment the null_node * reference count (but NOT the lower vnode's VREF counter). */ hd = NULL_NHASH(lowervp); mtx_lock(&null_hashmtx); LIST_FOREACH(a, hd, null_hash) { if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { vp = NULLTOV(a); VI_LOCK(vp); mtx_unlock(&null_hashmtx); /* * We need to clear the OWEINACT flag here as this * may lead vget() to try to lock our vnode which * is already locked via lowervp. */ vp->v_iflag &= ~VI_OWEINACT; error = vget(vp, LK_INTERLOCK, td); /* * Since we have the lower node locked the nullfs * node can not be in the process of recycling. If * it had been recycled before we grabed the lower * lock it would not have been found on the hash. */ if (error) panic("null_hashget: vget error %d", error); return (vp); } } mtx_unlock(&null_hashmtx); return (NULLVP); } /* * Act like null_hashget, but add passed null_node to hash if no existing * node found. */ static struct vnode * null_hashins(mp, xp) struct mount *mp; struct null_node *xp; { struct thread *td = curthread; /* XXX */ struct null_node_hashhead *hd; struct null_node *oxp; struct vnode *ovp; int error; hd = NULL_NHASH(xp->null_lowervp); mtx_lock(&null_hashmtx); LIST_FOREACH(oxp, hd, null_hash) { if (oxp->null_lowervp == xp->null_lowervp && NULLTOV(oxp)->v_mount == mp) { /* * See null_hashget for a description of this * operation. */ ovp = NULLTOV(oxp); VI_LOCK(ovp); mtx_unlock(&null_hashmtx); ovp->v_iflag &= ~VI_OWEINACT; error = vget(ovp, LK_INTERLOCK, td); if (error) panic("null_hashins: vget error %d", error); return (ovp); } } LIST_INSERT_HEAD(hd, xp, null_hash); mtx_unlock(&null_hashmtx); return (NULLVP); } static void null_insmntque_dtr(struct vnode *vp, void *xp) { vp->v_data = NULL; vp->v_vnlock = &vp->v_lock; FREE(xp, M_NULLFSNODE); vp->v_op = &dead_vnodeops; - (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } /* * Make a new or get existing nullfs node. * Vp is the alias vnode, lowervp is the lower vnode. * * The lowervp assumed to be locked and having "spare" reference. This routine * vrele lowervp if nullfs node was taken from hash. Otherwise it "transfers" * the caller's "spare" reference to created nullfs vnode. */ int null_nodeget(mp, lowervp, vpp) struct mount *mp; struct vnode *lowervp; struct vnode **vpp; { struct null_node *xp; struct vnode *vp; int error; /* Lookup the hash firstly */ *vpp = null_hashget(mp, lowervp); if (*vpp != NULL) { vrele(lowervp); return (0); } /* * We do not serialize vnode creation, instead we will check for * duplicates later, when adding new vnode to hash. * * Note that duplicate can only appear in hash if the lowervp is * locked LK_SHARED. */ /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. */ MALLOC(xp, struct null_node *, sizeof(struct null_node), M_NULLFSNODE, M_WAITOK); error = getnewvnode("null", mp, &null_vnodeops, &vp); if (error) { FREE(xp, M_NULLFSNODE); return (error); } xp->null_vnode = vp; xp->null_lowervp = lowervp; vp->v_type = lowervp->v_type; vp->v_data = xp; vp->v_vnlock = lowervp->v_vnlock; if (vp->v_vnlock == NULL) panic("null_nodeget: Passed a NULL vnlock.\n"); error = insmntque1(vp, mp, null_insmntque_dtr, xp); if (error != 0) return (error); /* * Atomically insert our new node into the hash or vget existing * if someone else has beaten us to it. */ *vpp = null_hashins(mp, xp); if (*vpp != NULL) { vrele(lowervp); vp->v_vnlock = &vp->v_lock; xp->null_lowervp = NULL; vrele(vp); return (0); } *vpp = vp; return (0); } /* * Remove node from hash. */ void null_hashrem(xp) struct null_node *xp; { mtx_lock(&null_hashmtx); LIST_REMOVE(xp, null_hash); mtx_unlock(&null_hashmtx); } #ifdef DIAGNOSTIC #ifdef KDB #define null_checkvp_barrier 1 #else #define null_checkvp_barrier 0 #endif struct vnode * null_checkvp(vp, fil, lno) struct vnode *vp; char *fil; int lno; { int interlock = 0; struct null_node *a = VTONULL(vp); #ifdef notyet /* * Can't do this check because vop_reclaim runs * with a funny vop vector. */ if (vp->v_op != null_vnodeop_p) { printf ("null_checkvp: on non-null-node\n"); while (null_checkvp_barrier) /*WAIT*/ ; panic("null_checkvp"); }; #endif if (a->null_lowervp == NULLVP) { /* Should never happen */ int i; u_long *p; printf("vp = %p, ZERO ptr\n", (void *)vp); for (p = (u_long *) a, i = 0; i < 8; i++) printf(" %lx", p[i]); printf("\n"); /* wait for debugger */ while (null_checkvp_barrier) /*WAIT*/ ; panic("null_checkvp"); } if (mtx_owned(VI_MTX(vp)) != 0) { VI_UNLOCK(vp); interlock = 1; } if (vrefcnt(a->null_lowervp) < 1) { int i; u_long *p; printf("vp = %p, unref'ed lowervp\n", (void *)vp); for (p = (u_long *) a, i = 0; i < 8; i++) printf(" %lx", p[i]); printf("\n"); /* wait for debugger */ while (null_checkvp_barrier) /*WAIT*/ ; panic ("null with unref'ed lowervp"); }; if (interlock != 0) VI_LOCK(vp); #ifdef notyet printf("null %x/%d -> %x/%d [%s, %d]\n", NULLTOV(a), vrefcnt(NULLTOV(a)), a->null_lowervp, vrefcnt(a->null_lowervp), fil, lno); #endif return a->null_lowervp; } #endif Index: head/sys/fs/nullfs/null_vfsops.c =================================================================== --- head/sys/fs/nullfs/null_vfsops.c (revision 175201) +++ head/sys/fs/nullfs/null_vfsops.c (revision 175202) @@ -1,370 +1,370 @@ /*- * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)null_vfsops.c 8.2 (Berkeley) 1/21/94 * * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 * $FreeBSD$ */ /* * Null Layer * (See null_vnops.c for a description of what this does.) */ #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NULLFSMNT, "nullfs_mount", "NULLFS mount structure"); static vfs_fhtovp_t nullfs_fhtovp; static vfs_mount_t nullfs_mount; static vfs_quotactl_t nullfs_quotactl; static vfs_root_t nullfs_root; static vfs_sync_t nullfs_sync; static vfs_statfs_t nullfs_statfs; static vfs_unmount_t nullfs_unmount; static vfs_vget_t nullfs_vget; static vfs_extattrctl_t nullfs_extattrctl; /* * Mount null layer */ static int nullfs_mount(struct mount *mp, struct thread *td) { int error = 0; struct vnode *lowerrootvp, *vp; struct vnode *nullm_rootvp; struct null_mount *xmp; char *target; int isvnunlocked = 0, len; struct nameidata nd, *ndp = &nd; NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp); if (mp->mnt_flag & MNT_ROOTFS) return (EOPNOTSUPP); /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { /* * Only support update mounts for NFS export. */ if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) return (0); else return (EOPNOTSUPP); } /* * Get argument */ error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len); if (error || target[len - 1] != '\0') return (EINVAL); /* * Unlock lower node to avoid deadlock. * (XXX) VOP_ISLOCKED is needed? */ if ((mp->mnt_vnodecovered->v_op == &null_vnodeops) && VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) { VOP_UNLOCK(mp->mnt_vnodecovered, 0, td); isvnunlocked = 1; } /* * Find lower node */ NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, td); error = namei(ndp); /* * Re-lock vnode. */ if (isvnunlocked && !VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) - vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); /* * Sanity check on lower vnode */ lowerrootvp = ndp->ni_vp; /* * Check multi null mount to avoid `lock against myself' panic. */ if (lowerrootvp == VTONULL(mp->mnt_vnodecovered)->null_lowervp) { NULLFSDEBUG("nullfs_mount: multi null mount?\n"); vput(lowerrootvp); return (EDEADLK); } xmp = (struct null_mount *) malloc(sizeof(struct null_mount), M_NULLFSMNT, M_WAITOK); /* XXX */ /* * Save reference to underlying FS */ xmp->nullm_vfs = lowerrootvp->v_mount; /* * Save reference. Each mount also holds * a reference on the root vnode. */ error = null_nodeget(mp, lowerrootvp, &vp); /* * Make sure the node alias worked */ if (error) { VOP_UNLOCK(vp, 0, td); vrele(lowerrootvp); free(xmp, M_NULLFSMNT); /* XXX */ return (error); } /* * Keep a held reference to the root vnode. * It is vrele'd in nullfs_unmount. */ nullm_rootvp = vp; nullm_rootvp->v_vflag |= VV_ROOT; xmp->nullm_rootvp = nullm_rootvp; /* * Unlock the node (either the lower or the alias) */ VOP_UNLOCK(vp, 0, td); if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); } MNT_ILOCK(mp); mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE; MNT_IUNLOCK(mp); mp->mnt_data = xmp; vfs_getnewfsid(mp); vfs_mountedfrom(mp, target); NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); return (0); } /* * Free reference to null layer */ static int nullfs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { void *mntdata; int error; int flags = 0; NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp); if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* There is 1 extra root vnode reference (nullm_rootvp). */ error = vflush(mp, 1, flags, td); if (error) return (error); /* * Finally, throw away the null_mount structure */ mntdata = mp->mnt_data; mp->mnt_data = 0; free(mntdata, M_NULLFSMNT); return 0; } static int nullfs_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct vnode *vp; NULLFSDEBUG("nullfs_root(mp = %p, vp = %p->%p)\n", (void *)mp, (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp, (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp)); /* * Return locked reference to root. */ vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; VREF(vp); #ifdef NULLFS_DEBUG if (VOP_ISLOCKED(vp, NULL)) panic("root vnode is locked.\n"); #endif - vn_lock(vp, flags | LK_RETRY, td); + vn_lock(vp, flags | LK_RETRY); *vpp = vp; return 0; } static int nullfs_quotactl(mp, cmd, uid, arg, td) struct mount *mp; int cmd; uid_t uid; void *arg; struct thread *td; { return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td); } static int nullfs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { int error; struct statfs mstat; NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p->%p)\n", (void *)mp, (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp, (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp)); bzero(&mstat, sizeof(mstat)); error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td); if (error) return (error); /* now copy across the "interesting" information and fake the rest */ sbp->f_type = mstat.f_type; sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; sbp->f_blocks = mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files = mstat.f_files; sbp->f_ffree = mstat.f_ffree; return (0); } static int nullfs_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { /* * XXX - Assumes no data cached at null layer. */ return (0); } static int nullfs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { int error; error = VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, flags, vpp); if (error) return (error); return (null_nodeget(mp, *vpp, vpp)); } static int nullfs_fhtovp(mp, fidp, vpp) struct mount *mp; struct fid *fidp; struct vnode **vpp; { int error; error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, vpp); if (error) return (error); return (null_nodeget(mp, *vpp, vpp)); } static int nullfs_extattrctl(mp, cmd, filename_vp, namespace, attrname, td) struct mount *mp; int cmd; struct vnode *filename_vp; int namespace; const char *attrname; struct thread *td; { return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, filename_vp, namespace, attrname, td); } static struct vfsops null_vfsops = { .vfs_extattrctl = nullfs_extattrctl, .vfs_fhtovp = nullfs_fhtovp, .vfs_init = nullfs_init, .vfs_mount = nullfs_mount, .vfs_quotactl = nullfs_quotactl, .vfs_root = nullfs_root, .vfs_statfs = nullfs_statfs, .vfs_sync = nullfs_sync, .vfs_uninit = nullfs_uninit, .vfs_unmount = nullfs_unmount, .vfs_vget = nullfs_vget, }; VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK); Index: head/sys/fs/nwfs/nwfs_node.c =================================================================== --- head/sys/fs/nwfs/nwfs_node.c (revision 175201) +++ head/sys/fs/nwfs/nwfs_node.c (revision 175202) @@ -1,383 +1,383 @@ /*- * Copyright (c) 1999, 2000 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NWNOHASH(fhsum) (&nwhashtbl[(fhsum.f_id) & nwnodehash]) static LIST_HEAD(nwnode_hash_head,nwnode) *nwhashtbl; static u_long nwnodehash; static struct lock nwhashlock; static MALLOC_DEFINE(M_NWNODE, "nwfs_node", "NWFS vnode private part"); static MALLOC_DEFINE(M_NWFSHASH, "nwfs_hash", "NWFS has table"); static int nwfs_sysctl_vnprint(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_vfs_nwfs); SYSCTL_PROC(_vfs_nwfs, OID_AUTO, vnprint, CTLFLAG_WR|CTLTYPE_OPAQUE, NULL, 0, nwfs_sysctl_vnprint, "S,vnlist", "vnode hash"); void nwfs_hash_init(void) { nwhashtbl = hashinit(desiredvnodes, M_NWFSHASH, &nwnodehash); lockinit(&nwhashlock, PVFS, "nwfshl", 0, 0); } void nwfs_hash_free(void) { lockdestroy(&nwhashlock); free(nwhashtbl, M_NWFSHASH); } int nwfs_sysctl_vnprint(SYSCTL_HANDLER_ARGS) { struct nwnode *np; struct nwnode_hash_head *nhpp; struct vnode *vp; int i; if (nwfs_debuglevel == 0) return 0; printf("Name:uc:hc:fid:pfid\n"); for(i = 0; i <= nwnodehash; i++) { nhpp = &nwhashtbl[i]; LIST_FOREACH(np, nhpp, n_hash) { vp = NWTOV(np); vprint("", vp); printf("%s:%d:%d:%d:%d\n",np->n_name,vrefcnt(vp), vp->v_holdcnt,np->n_fid.f_id, np->n_fid.f_parent); } } return 0; } /* * Search nwnode with given fid. * Hash list should be locked by caller. */ static int nwfs_hashlookup(struct nwmount *nmp, ncpfid fid, struct nwnode **npp) { struct nwnode *np; struct nwnode_hash_head *nhpp; nhpp = NWNOHASH(fid); LIST_FOREACH(np, nhpp, n_hash) { if (nmp != np->n_mount || !NWCMPF(&fid, &np->n_fid)) continue; if (npp) *npp = np; return 0; } return ENOENT; } /* * Allocate new nwfsnode/vnode from given nwnode. * Vnode referenced and not locked. */ static int nwfs_allocvp(struct mount *mp, ncpfid fid, struct nw_entry_info *fap, struct vnode *dvp, struct vnode **vpp) { struct thread *td = curthread; /* XXX */ struct nwnode *np; struct nwnode_hash_head *nhpp; struct nwmount *nmp = VFSTONWFS(mp); struct vnode *vp; int error; loop: lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td); rescan: if (nwfs_hashlookup(nmp, fid, &np) == 0) { vp = NWTOV(np); mtx_lock(&vp->v_interlock); lockmgr(&nwhashlock, LK_RELEASE, NULL, td); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) goto loop; if (fap) np->n_attr = fap->attributes; *vpp = vp; return(0); } lockmgr(&nwhashlock, LK_RELEASE, NULL, td); if (fap == NULL || ((fap->attributes & aDIR) == 0 && dvp == NULL)) panic("nwfs_allocvp: fap = %p, dvp = %p\n", fap, dvp); /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. */ MALLOC(np, struct nwnode *, sizeof *np, M_NWNODE, M_WAITOK | M_ZERO); error = getnewvnode("nwfs", mp, &nwfs_vnodeops, &vp); if (error) { *vpp = NULL; FREE(np, M_NWNODE); return (error); } error = insmntque(vp, mp); /* XXX: Too early for mpsafe fs */ if (error != 0) { FREE(np, M_NWNODE); *vpp = NULL; return (error); } vp->v_data = np; np->n_vnode = vp; np->n_mount = nmp; np->n_attr = fap->attributes; vp->v_type = np->n_attr & aDIR ? VDIR : VREG; np->n_fid = fid; if (dvp) { np->n_parent = VTONW(dvp)->n_fid; } vp->v_vnlock->lk_flags |= LK_CANRECURSE; lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td); /* * Another process can create vnode while we blocked in malloc() or * getnewvnode(). Rescan list again. */ if (nwfs_hashlookup(nmp, fid, NULL) == 0) { vp->v_data = NULL; np->n_vnode = NULL; vrele(vp); FREE(np, M_NWNODE); goto rescan; } *vpp = vp; nhpp = NWNOHASH(fid); LIST_INSERT_HEAD(nhpp, np, n_hash); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); lockmgr(&nwhashlock, LK_RELEASE, NULL, td); ASSERT_VOP_LOCKED(dvp, "nwfs_allocvp"); if (vp->v_type == VDIR && dvp && (dvp->v_vflag & VV_ROOT) == 0) { np->n_flag |= NREFPARENT; vref(dvp); } return 0; } int nwfs_nget(struct mount *mp, ncpfid fid, struct nw_entry_info *fap, struct vnode *dvp, struct vnode **vpp) { struct vnode *vp; int error; *vpp = NULL; error = nwfs_allocvp(mp, fid, fap, dvp, &vp); if (error) return error; if (fap) nwfs_attr_cacheenter(vp, fap); *vpp = vp; return 0; } int nwfs_lookupnp(struct nwmount *nmp, ncpfid fid, struct thread *td, struct nwnode **npp) { int error; lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td); error = nwfs_hashlookup(nmp, fid, npp); lockmgr(&nwhashlock, LK_RELEASE, NULL, td); return error; } /* * Free nwnode, and give vnode back to system */ int nwfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *dvp = NULL, *vp = ap->a_vp; struct nwnode *dnp, *np = VTONW(vp); struct nwmount *nmp = VTONWFS(vp); struct thread *td = ap->a_td; NCPVNDEBUG("%s,%d\n", np->n_name, vrefcnt(vp)); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); if (np->n_flag & NREFPARENT) { np->n_flag &= ~NREFPARENT; if (nwfs_lookupnp(nmp, np->n_parent, td, &dnp) == 0) { dvp = dnp->n_vnode; } else { NCPVNDEBUG("%s: has no parent ?\n",np->n_name); } } lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td); LIST_REMOVE(np, n_hash); lockmgr(&nwhashlock, LK_RELEASE, NULL, td); if (nmp->n_root == np) { nmp->n_root = NULL; } vp->v_data = NULL; FREE(np, M_NWNODE); if (dvp) { vrele(dvp); } return (0); } int nwfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct thread *td = ap->a_td; struct ucred *cred = td->td_ucred; struct vnode *vp = ap->a_vp; struct nwnode *np = VTONW(vp); int error; NCPVNDEBUG("%s: %d\n", VTONW(vp)->n_name, vrefcnt(vp)); if (np->opened) { error = nwfs_vinvalbuf(vp, td); error = ncp_close_file(NWFSTOCONN(VTONWFS(vp)), &np->n_fh, td, cred); np->opened = 0; } if (np->n_flag & NSHOULDFREE) { cache_purge(vp); vgone(vp); } return (0); } /* * routines to maintain vnode attributes cache * nwfs_attr_cacheenter: unpack np.i to va structure */ void nwfs_attr_cacheenter(struct vnode *vp, struct nw_entry_info *fi) { struct nwnode *np = VTONW(vp); struct nwmount *nmp = VTONWFS(vp); struct vattr *va = &np->n_vattr; va->va_type = vp->v_type; /* vnode type (for create) */ np->n_nmlen = fi->nameLen; bcopy(fi->entryName, np->n_name, np->n_nmlen); np->n_name[fi->nameLen] = 0; if (vp->v_type == VREG) { if (va->va_size != fi->dataStreamSize) { va->va_size = fi->dataStreamSize; vnode_pager_setsize(vp, va->va_size); } va->va_mode = nmp->m.file_mode; /* files access mode and type */ } else if (vp->v_type == VDIR) { va->va_size = 16384; /* should be a better way ... */ va->va_mode = nmp->m.dir_mode; /* files access mode and type */ } else return; np->n_size = va->va_size; va->va_nlink = 1; /* number of references to file */ va->va_uid = nmp->m.uid; /* owner user id */ va->va_gid = nmp->m.gid; /* owner group id */ va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; va->va_fileid = np->n_fid.f_id; /* file id */ if (va->va_fileid == 0) va->va_fileid = NWFS_ROOT_INO; va->va_blocksize=nmp->connh->nh_conn->buffer_size;/* blocksize preferred for i/o */ /* time of last modification */ ncp_dos2unixtime(fi->modifyDate, fi->modifyTime, 0, nmp->m.tz, &va->va_mtime); /* time of last access */ ncp_dos2unixtime(fi->lastAccessDate, 0, 0, nmp->m.tz, &va->va_atime); va->va_ctime = va->va_mtime; /* time file changed */ va->va_gen = VNOVAL; /* generation number of file */ va->va_flags = 0; /* flags defined for file */ va->va_rdev = VNOVAL; /* device the special file represents */ va->va_bytes = va->va_size; /* bytes of disk space held by file */ va->va_filerev = 0; /* file modification number */ va->va_vaflags = 0; /* operations flags */ np->n_vattr = *va; if (np->n_mtime == 0) { np->n_mtime = va->va_mtime.tv_sec; } np->n_atime = time_second; np->n_dosfid = fi->DosDirNum; return; } int nwfs_attr_cachelookup(struct vnode *vp, struct vattr *va) { struct nwnode *np = VTONW(vp); int diff; diff = time_second - np->n_atime; if (diff > 2) { /* XXX should be configurable */ return ENOENT; } *va = np->n_vattr; return 0; } Index: head/sys/fs/nwfs/nwfs_vnops.c =================================================================== --- head/sys/fs/nwfs/nwfs_vnops.c (revision 175201) +++ head/sys/fs/nwfs/nwfs_vnops.c (revision 175202) @@ -1,981 +1,981 @@ /*- * Copyright (c) 1999, 2000, 2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for NWFS vnode operations */ static vop_create_t nwfs_create; static vop_mknod_t nwfs_mknod; static vop_open_t nwfs_open; static vop_close_t nwfs_close; static vop_access_t nwfs_access; static vop_getattr_t nwfs_getattr; static vop_setattr_t nwfs_setattr; static vop_read_t nwfs_read; static vop_write_t nwfs_write; static vop_fsync_t nwfs_fsync; static vop_remove_t nwfs_remove; static vop_link_t nwfs_link; static vop_lookup_t nwfs_lookup; static vop_rename_t nwfs_rename; static vop_mkdir_t nwfs_mkdir; static vop_rmdir_t nwfs_rmdir; static vop_symlink_t nwfs_symlink; static vop_readdir_t nwfs_readdir; static vop_strategy_t nwfs_strategy; static vop_print_t nwfs_print; static vop_pathconf_t nwfs_pathconf; /* Global vfs data structures for nwfs */ struct vop_vector nwfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = nwfs_access, .vop_close = nwfs_close, .vop_create = nwfs_create, .vop_fsync = nwfs_fsync, .vop_getattr = nwfs_getattr, .vop_getpages = nwfs_getpages, .vop_inactive = nwfs_inactive, .vop_ioctl = nwfs_ioctl, .vop_link = nwfs_link, .vop_lookup = nwfs_lookup, .vop_mkdir = nwfs_mkdir, .vop_mknod = nwfs_mknod, .vop_open = nwfs_open, .vop_pathconf = nwfs_pathconf, .vop_print = nwfs_print, .vop_putpages = nwfs_putpages, .vop_read = nwfs_read, .vop_readdir = nwfs_readdir, .vop_reclaim = nwfs_reclaim, .vop_remove = nwfs_remove, .vop_rename = nwfs_rename, .vop_rmdir = nwfs_rmdir, .vop_setattr = nwfs_setattr, .vop_strategy = nwfs_strategy, .vop_symlink = nwfs_symlink, .vop_write = nwfs_write, }; /* * nwfs_access vnode op */ static int nwfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *td; } */ *ap; { struct vnode *vp = ap->a_vp; mode_t mpmode; struct nwmount *nmp = VTONWFS(vp); NCPVNDEBUG("\n"); if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } mpmode = vp->v_type == VREG ? nmp->m.file_mode : nmp->m.dir_mode; return (vaccess(vp->v_type, mpmode, nmp->m.uid, nmp->m.gid, ap->a_mode, ap->a_cred, NULL)); } /* * nwfs_open vnode op */ /* ARGSUSED */ static int nwfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *td; } */ *ap; { struct vnode *vp = ap->a_vp; int mode = ap->a_mode; struct nwnode *np = VTONW(vp); struct ncp_open_info no; struct nwmount *nmp = VTONWFS(vp); struct vattr vattr; int error, nwm; NCPVNDEBUG("%s,%d\n", np->n_name, np->opened); if (vp->v_type != VREG && vp->v_type != VDIR) { NCPFATAL("open vtype = %d\n", vp->v_type); return (EACCES); } if (vp->v_type == VDIR) return 0; /* nothing to do now */ if (np->n_flag & NMODIFIED) { if ((error = nwfs_vinvalbuf(vp, ap->a_td)) == EINTR) return (error); np->n_atime = 0; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { if ((error = nwfs_vinvalbuf(vp, ap->a_td)) == EINTR) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } } if (np->opened) { np->opened++; return 0; } nwm = AR_READ; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) nwm |= AR_WRITE; error = ncp_open_create_file_or_subdir(nmp, vp, 0, NULL, OC_MODE_OPEN, 0, nwm, &no, ap->a_td, ap->a_cred); if (error) { if (mode & FWRITE) return EACCES; nwm = AR_READ; error = ncp_open_create_file_or_subdir(nmp, vp, 0, NULL, OC_MODE_OPEN, 0, nwm, &no, ap->a_td, ap->a_cred); } if (!error) { np->opened++; np->n_fh = no.fh; np->n_origfh = no.origfh; } np->n_atime = 0; return (error); } static int nwfs_close(ap) struct vop_close_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *td; } */ *ap; { struct vnode *vp = ap->a_vp; struct nwnode *np = VTONW(vp); int error; NCPVNDEBUG("name=%s,pid=%d,c=%d\n", np->n_name, ap->a_td->td_proc->p_pid, np->opened); if (vp->v_type == VDIR) return 0; /* nothing to do now */ error = 0; mtx_lock(&vp->v_interlock); if (np->opened == 0) { mtx_unlock(&vp->v_interlock); return 0; } mtx_unlock(&vp->v_interlock); error = nwfs_vinvalbuf(vp, ap->a_td); mtx_lock(&vp->v_interlock); if (np->opened == 0) { mtx_unlock(&vp->v_interlock); return 0; } if (--np->opened == 0) { mtx_unlock(&vp->v_interlock); error = ncp_close_file(NWFSTOCONN(VTONWFS(vp)), &np->n_fh, ap->a_td, ap->a_cred); } else mtx_unlock(&vp->v_interlock); np->n_atime = 0; return (error); } /* * nwfs_getattr call from vfs. */ static int nwfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *td; } */ *ap; { struct vnode *vp = ap->a_vp; struct nwnode *np = VTONW(vp); struct vattr *va=ap->a_vap; struct nwmount *nmp = VTONWFS(vp); struct nw_entry_info fattr; int error; u_int32_t oldsize; NCPVNDEBUG("%lx:%d: '%s' %d\n", (long)vp, nmp->n_volume, np->n_name, (vp->v_vflag & VV_ROOT) != 0); error = nwfs_attr_cachelookup(vp, va); if (!error) return 0; NCPVNDEBUG("not in cache\n"); oldsize = np->n_size; if (np->n_flag & NVOLUME) { error = ncp_obtain_info(nmp, np->n_fid.f_id, 0, NULL, &fattr, ap->a_td, ap->a_cred); } else { error = ncp_obtain_info(nmp, np->n_fid.f_parent, np->n_nmlen, np->n_name, &fattr, ap->a_td, ap->a_cred); } if (error) { NCPVNDEBUG("error %d\n", error); return error; } nwfs_attr_cacheenter(vp, &fattr); *va = np->n_vattr; if (np->opened) np->n_size = oldsize; return (0); } /* * nwfs_setattr call from vfs. */ static int nwfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *td; } */ *ap; { struct vnode *vp = ap->a_vp; struct nwnode *np = VTONW(vp); struct vattr *vap = ap->a_vap; u_quad_t tsize=0; int error = 0; NCPVNDEBUG("\n"); if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&(vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); vnode_pager_setsize(vp, (u_long)vap->va_size); tsize = np->n_size; np->n_size = vap->va_size; break; default: return EINVAL; }; } error = ncp_setattr(vp, vap, ap->a_cred, ap->a_td); if (error && vap->va_size != VNOVAL) { np->n_size = tsize; vnode_pager_setsize(vp, (u_long)tsize); } np->n_atime = 0; /* invalidate cache */ VOP_GETATTR(vp, vap, ap->a_cred, ap->a_td); np->n_mtime = vap->va_mtime.tv_sec; return (0); } /* * nwfs_read call. */ static int nwfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio=ap->a_uio; int error; NCPVNDEBUG("nwfs_read:\n"); if (vp->v_type != VREG && vp->v_type != VDIR) return (EPERM); error = nwfs_readvnode(vp, uio, ap->a_cred); return error; } static int nwfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; NCPVNDEBUG("%d,ofs=%d,sz=%d\n", vp->v_type, (int)uio->uio_offset, uio->uio_resid); if (vp->v_type != VREG) return (EPERM); error = nwfs_writevnode(vp, uio, ap->a_cred, ap->a_ioflag); return(error); } /* * nwfs_create call * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int nwfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct vnode **vpp=ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vnode *vp = (struct vnode *)0; int error = 0, fmode; struct vattr vattr; struct nwnode *np; struct ncp_open_info no; struct nwmount *nmp=VTONWFS(dvp); ncpfid fid; NCPVNDEBUG("\n"); *vpp = NULL; if (vap->va_type == VSOCK) return (EOPNOTSUPP); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) { return (error); } fmode = AR_READ | AR_WRITE; /* if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= AR_DENY_READ | AR_DENY_WRITE;*/ error = ncp_open_create_file_or_subdir(nmp, dvp, cnp->cn_namelen, cnp->cn_nameptr, OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, 0, fmode, &no, cnp->cn_thread, cnp->cn_cred); if (!error) { error = ncp_close_file(NWFSTOCONN(nmp), &no.fh, cnp->cn_thread, cnp->cn_cred); fid.f_parent = VTONW(dvp)->n_fid.f_id; fid.f_id = no.fattr.dirEntNum; error = nwfs_nget(VTOVFS(dvp), fid, &no.fattr, dvp, &vp); if (!error) { np = VTONW(vp); np->opened = 0; *vpp = vp; } if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, vp, cnp); } return (error); } /* * nwfs_remove call. It isn't possible to emulate UFS behaivour because * NetWare doesn't allow delete/rename operations on an opened file. */ static int nwfs_remove(ap) struct vop_remove_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode * a_vp; struct componentname * a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nwnode *np = VTONW(vp); struct nwmount *nmp = VTONWFS(vp); int error; if (vp->v_type == VDIR || np->opened || vrefcnt(vp) != 1) return EPERM; cache_purge(vp); error = ncp_DeleteNSEntry(nmp, VTONW(dvp)->n_fid.f_id, cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_thread, cnp->cn_cred); if (error == 0) np->n_flag |= NSHOULDFREE; else if (error == 0x899c) error = EACCES; return (error); } /* * nwfs_file rename call */ static int nwfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct nwmount *nmp=VTONWFS(fvp); u_int16_t oldtype = 6; int error=0; /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (tvp && vrefcnt(tvp) > 1) { error = EBUSY; goto out; } if (fvp->v_type == VDIR) { oldtype |= NW_TYPE_SUBDIR; } else if (fvp->v_type == VREG) { oldtype |= NW_TYPE_FILE; } else { error = EINVAL; goto out; } if (tvp && tvp != fvp) { error = ncp_DeleteNSEntry(nmp, VTONW(tdvp)->n_fid.f_id, tcnp->cn_namelen, tcnp->cn_nameptr, tcnp->cn_thread, tcnp->cn_cred); if (error == 0x899c) error = EACCES; if (error) goto out_cacherem; } error = ncp_nsrename(NWFSTOCONN(nmp), nmp->n_volume, nmp->name_space, oldtype, &nmp->m.nls, VTONW(fdvp)->n_fid.f_id, fcnp->cn_nameptr, fcnp->cn_namelen, VTONW(tdvp)->n_fid.f_id, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_thread, tcnp->cn_cred); if (error == 0x8992) error = EEXIST; if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out_cacherem: nwfs_attr_cacheremove(fdvp); nwfs_attr_cacheremove(tdvp); nwfs_attr_cacheremove(fvp); out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); if (tvp) nwfs_attr_cacheremove(tvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nwfs hard link create call * Netware filesystems don't know what links are. */ static int nwfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { return EOPNOTSUPP; } /* * nwfs_symlink link create call * Netware filesystems don't know what symlinks are. */ static int nwfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { return (EOPNOTSUPP); } static int nwfs_mknod(ap) struct vop_mknod_args /* { } */ *ap; { return (EOPNOTSUPP); } /* * nwfs_mkdir call */ static int nwfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; /* struct vattr *vap = ap->a_vap;*/ struct componentname *cnp = ap->a_cnp; int len=cnp->cn_namelen; struct ncp_open_info no; struct nwnode *np; struct vnode *newvp = (struct vnode *)0; ncpfid fid; int error = 0; struct vattr vattr; char *name=cnp->cn_nameptr; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) { return (error); } if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.')))) { return EEXIST; } if (ncp_open_create_file_or_subdir(VTONWFS(dvp), dvp, cnp->cn_namelen, cnp->cn_nameptr, OC_MODE_CREATE, aDIR, 0xffff, &no, cnp->cn_thread, cnp->cn_cred) != 0) { error = EACCES; } else { error = 0; } if (!error) { fid.f_parent = VTONW(dvp)->n_fid.f_id; fid.f_id = no.fattr.dirEntNum; error = nwfs_nget(VTOVFS(dvp), fid, &no.fattr, dvp, &newvp); if (!error) { np = VTONW(newvp); newvp->v_type = VDIR; *ap->a_vpp = newvp; } } return (error); } /* * nwfs_remove directory call */ static int nwfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nwnode *np = VTONW(vp); struct nwmount *nmp = VTONWFS(vp); struct nwnode *dnp = VTONW(dvp); int error = EIO; if (dvp == vp) return EINVAL; error = ncp_DeleteNSEntry(nmp, dnp->n_fid.f_id, cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_thread, cnp->cn_cred); if (error == 0) np->n_flag |= NSHOULDFREE; else if (error == NWE_DIR_NOT_EMPTY) error = ENOTEMPTY; dnp->n_flag |= NMODIFIED; nwfs_attr_cacheremove(dvp); cache_purge(dvp); cache_purge(vp); return (error); } /* * nwfs_readdir call */ static int nwfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; if (vp->v_type != VDIR) return (EPERM); if (ap->a_ncookies) { printf("nwfs_readdir: no support for cookies now..."); return (EOPNOTSUPP); } error = nwfs_readvnode(vp, uio, ap->a_cred); return error; } /* ARGSUSED */ static int nwfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { /* return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_td, 1));*/ return (0); } /* ARGSUSED */ static int nwfs_print (ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct nwnode *np = VTONW(vp); printf("\tnwfs node: name = '%s', fid = %d, pfid = %d\n", np->n_name, np->n_fid.f_id, np->n_fid.f_parent); return (0); } static int nwfs_pathconf (ap) struct vop_pathconf_args /* { struct vnode *vp; int name; register_t *retval; } */ *ap; { int name=ap->a_name, error=0; register_t *retval=ap->a_retval; switch(name){ case _PC_LINK_MAX: *retval=0; break; case _PC_NAME_MAX: *retval=NCP_MAX_FILENAME; /* XXX from nwfsnode */ break; case _PC_PATH_MAX: *retval=NCP_MAXPATHLEN; /* XXX from nwfsnode */ break; default: error=EINVAL; } return(error); } static int nwfs_strategy (ap) struct vop_strategy_args /* { struct buf *a_bp } */ *ap; { struct buf *bp=ap->a_bp; struct ucred *cr; struct thread *td; int error = 0; NCPVNDEBUG("\n"); if (bp->b_flags & B_ASYNC) td = (struct thread *)0; else td = curthread; /* XXX */ if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 ) error = nwfs_doio(ap->a_vp, bp, cr, td); return (error); } /* * How to keep the brain busy ... * Currently lookup routine can make two lookup for vnode. This can be * avoided by reorg the code. */ int nwfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int flags = cnp->cn_flags; struct vnode *vp; struct nwmount *nmp; struct mount *mp = dvp->v_mount; struct nwnode *dnp, *npp; struct nw_entry_info fattr, *fap; ncpfid fid; int nameiop=cnp->cn_nameiop, islastcn; int error = 0, notfound; struct thread *td = cnp->cn_thread; char _name[cnp->cn_namelen+1]; bcopy(cnp->cn_nameptr, _name, cnp->cn_namelen); _name[cnp->cn_namelen]=0; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) { printf("nwfs_lookup: invalid '..'\n"); return EIO; } NCPVNDEBUG("%d '%s' in '%s' id=d\n", nameiop, _name, VTONW(dvp)->n_name/*, VTONW(dvp)->n_name*/); islastcn = flags & ISLASTCN; if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP)) return (EROFS); if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td))) return (error); nmp = VFSTONWFS(mp); dnp = VTONW(dvp); /* printf("dvp %d:%d:%d\n", (int)mp, (int)dvp->v_vflag & VV_ROOT, (int)flags & ISDOTDOT); */ error = ncp_pathcheck(cnp->cn_nameptr, cnp->cn_namelen, &nmp->m.nls, (nameiop == CREATE || nameiop == RENAME) && (nmp->m.nls.opt & NWHP_NOSTRICT) == 0); if (error) return ENOENT; error = cache_lookup(dvp, vpp, cnp); NCPVNDEBUG("cache_lookup returned %d\n", error); if (error > 0) return error; if (error) { /* name was found */ struct vattr vattr; vp = *vpp; if (VOP_GETATTR(vp, &vattr, cnp->cn_cred, td) == 0 && vattr.va_ctime.tv_sec == VTONW(vp)->n_ctime) { if (nameiop != LOOKUP && islastcn) cnp->cn_flags |= SAVENAME; NCPVNDEBUG("use cached vnode"); return (0); } cache_purge(vp); if (vp != dvp) vput(vp); else vrele(vp); *vpp = NULLVP; } /* not in cache, so ... */ error = 0; *vpp = NULLVP; fap = NULL; if (flags & ISDOTDOT) { if (NWCMPF(&dnp->n_parent, &nmp->n_rootent)) { fid = nmp->n_rootent; fap = NULL; notfound = 0; } else { error = nwfs_lookupnp(nmp, dnp->n_parent, td, &npp); if (error) { return error; } fid = dnp->n_parent; fap = &fattr; /*np = *npp;*/ notfound = ncp_obtain_info(nmp, npp->n_dosfid, 0, NULL, fap, td, cnp->cn_cred); } } else { fap = &fattr; notfound = ncp_lookup(dvp, cnp->cn_namelen, cnp->cn_nameptr, fap, td, cnp->cn_cred); fid.f_id = fap->dirEntNum; if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { fid.f_parent = dnp->n_fid.f_parent; } else fid.f_parent = dnp->n_fid.f_id; NCPVNDEBUG("call to ncp_lookup returned=%d\n", notfound); } if (notfound && notfound < 0x80 ) return (notfound); /* hard error */ if (notfound) { /* entry not found */ /* Handle RENAME or CREATE case... */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return ENOENT; }/* else { NCPVNDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum); }*/ /* handle DELETE case ... */ if (nameiop == DELETE && islastcn) { /* delete last component */ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error) return (error); if (NWCMPF(&dnp->n_fid, &fid)) { /* we found ourselfs */ VREF(dvp); *vpp = dvp; return 0; } error = nwfs_nget(mp, fid, fap, dvp, &vp); if (error) return (error); *vpp = vp; cnp->cn_flags |= SAVENAME; /* I free it later */ return (0); } if (nameiop == RENAME && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error) return (error); if (NWCMPF(&dnp->n_fid, &fid)) return EISDIR; error = nwfs_nget(mp, fid, fap, dvp, &vp); if (error) return (error); *vpp = vp; cnp->cn_flags |= SAVENAME; return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); /* race to get the inode */ error = nwfs_nget(mp, fid, NULL, NULL, &vp); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); *vpp = vp; } else if (NWCMPF(&dnp->n_fid, &fid)) { vref(dvp); *vpp = dvp; } else { error = nwfs_nget(mp, fid, fap, dvp, &vp); if (error) return (error); *vpp = vp; NCPVNDEBUG("lookup: getnewvp!\n"); } if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) { VTONW(*vpp)->n_ctime = VTONW(*vpp)->n_vattr.va_ctime.tv_sec; cache_enter(dvp, *vpp, cnp); } return (0); } Index: head/sys/fs/portalfs/portal_vfsops.c =================================================================== --- head/sys/fs/portalfs/portal_vfsops.c (revision 175201) +++ head/sys/fs/portalfs/portal_vfsops.c (revision 175202) @@ -1,258 +1,258 @@ /*- * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)portal_vfsops.c 8.11 (Berkeley) 5/14/95 * * $FreeBSD$ */ /* * Portal Filesystem */ #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PORTALFSMNT, "portal_mount", "PORTAL mount structure"); static vfs_unmount_t portal_unmount; static vfs_root_t portal_root; static vfs_statfs_t portal_statfs; static const char *portal_opts[] = { "socket", "config", NULL }; static int portal_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct portal_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argf(ma, "socket", "%d", args.pa_socket); ma = mount_argsu(ma, "config", args.pa_config, MAXPATHLEN); error = kernel_mount(ma, flags); return (error); } /* * Mount the per-process file descriptors (/dev/fd) */ static int portal_mount(struct mount *mp, struct thread *td) { struct file *fp; struct portalmount *fmp; struct socket *so; struct vnode *rvp; struct portalnode *pn; int error, v; char *p; if (vfs_filteropt(mp->mnt_optnew, portal_opts)) return (EINVAL); error = vfs_scanopt(mp->mnt_optnew, "socket", "%d", &v); if (error != 1) return (EINVAL); error = vfs_getopt(mp->mnt_optnew, "config", (void **)&p, NULL); if (error) return (error); if ((error = fget(td, v, &fp)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); return(ENOTSOCK); } so = fp->f_data; /* XXX race against userland */ if (so->so_proto->pr_domain->dom_family != AF_UNIX) { fdrop(fp, td); return (ESOCKTNOSUPPORT); } MALLOC(pn, struct portalnode *, sizeof(struct portalnode), M_TEMP, M_WAITOK); MALLOC(fmp, struct portalmount *, sizeof(struct portalmount), M_PORTALFSMNT, M_WAITOK); /* XXX */ error = getnewvnode("portal", mp, &portal_vnodeops, &rvp); /* XXX */ if (error) { FREE(fmp, M_PORTALFSMNT); FREE(pn, M_TEMP); fdrop(fp, td); return (error); } error = insmntque(rvp, mp); /* XXX: Too early for mpsafe fs */ if (error != 0) { FREE(fmp, M_PORTALFSMNT); FREE(pn, M_TEMP); fdrop(fp, td); return (error); } rvp->v_data = pn; rvp->v_type = VDIR; rvp->v_vflag |= VV_ROOT; VTOPORTAL(rvp)->pt_arg = 0; VTOPORTAL(rvp)->pt_size = 0; VTOPORTAL(rvp)->pt_fileid = PORTAL_ROOTFILEID; fmp->pm_root = rvp; fhold(fp); fmp->pm_server = fp; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); mp->mnt_data = fmp; vfs_getnewfsid(mp); vfs_mountedfrom(mp, p); fdrop(fp, td); return (0); } static int portal_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... */ #ifdef notyet mntflushbuf(mp, 0); if (mntinvalbuf(mp, 1)) return (EBUSY); #endif /* There is 1 extra root vnode reference (pm_root). */ error = vflush(mp, 1, flags, td); if (error) return (error); /* * Shutdown the socket. This will cause the select in the * daemon to wake up, and then the accept will get ECONNABORTED * which it interprets as a request to go and bury itself. */ soshutdown(VFSTOPORTAL(mp)->pm_server->f_data, 2); /* * Discard reference to underlying file. Must call closef because * this may be the last reference. */ closef(VFSTOPORTAL(mp)->pm_server, (struct thread *) 0); /* * Finally, throw away the portalmount structure */ free(mp->mnt_data, M_PORTALFSMNT); /* XXX */ mp->mnt_data = 0; return (0); } static int portal_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct vnode *vp; /* * Return locked reference to root. */ vp = VFSTOPORTAL(mp)->pm_root; VREF(vp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); *vpp = vp; return (0); } static int portal_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; /* Allow for "." */ sbp->f_ffree = 0; /* See comments above */ return (0); } static struct vfsops portal_vfsops = { .vfs_cmount = portal_cmount, .vfs_mount = portal_mount, .vfs_root = portal_root, .vfs_statfs = portal_statfs, .vfs_unmount = portal_unmount, }; VFS_SET(portal_vfsops, portalfs, VFCF_SYNTHETIC); Index: head/sys/fs/portalfs/portal_vnops.c =================================================================== --- head/sys/fs/portalfs/portal_vnops.c (revision 175201) +++ head/sys/fs/portalfs/portal_vnops.c (revision 175202) @@ -1,567 +1,566 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)portal_vnops.c 8.14 (Berkeley) 5/21/95 * * $FreeBSD$ */ /* * Portal Filesystem */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int portal_fileid = PORTAL_ROOTFILEID+1; static void portal_closefd(struct thread *td, int fd); static int portal_connect(struct socket *so, struct socket *so2); static vop_getattr_t portal_getattr; static vop_lookup_t portal_lookup; static vop_open_t portal_open; static vop_readdir_t portal_readdir; static vop_reclaim_t portal_reclaim; static vop_setattr_t portal_setattr; static void portal_closefd(td, fd) struct thread *td; int fd; { int error; error = kern_close(td, fd); /* * We should never get an error, and there isn't anything * we could do if we got one, so just print a message. */ if (error) printf("portal_closefd: error = %d\n", error); } /* * vp is the current namei directory * cnp is the name to locate in that directory... */ static int portal_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; char *pname = cnp->cn_nameptr; - struct thread *td = cnp->cn_thread; struct portalnode *pt; int error; struct vnode *fvp = 0; char *path; int size; *vpp = NULLVP; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return (EROFS); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); return (0); } KASSERT((cnp->cn_flags & ISDOTDOT) == 0, ("portal_lookup: Can not handle dotdot lookups.")); /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. */ MALLOC(pt, struct portalnode *, sizeof(struct portalnode), M_TEMP, M_WAITOK); error = getnewvnode("portal", dvp->v_mount, &portal_vnodeops, &fvp); if (error) { FREE(pt, M_TEMP); goto bad; } fvp->v_type = VREG; fvp->v_data = pt; /* * Save all of the remaining pathname and * advance the namei next pointer to the end * of the string. */ for (size = 0, path = pname; *path; path++) size++; cnp->cn_consume = size - cnp->cn_namelen; pt->pt_arg = malloc(size+1, M_TEMP, M_WAITOK); pt->pt_size = size+1; bcopy(pname, pt->pt_arg, pt->pt_size); pt->pt_fileid = portal_fileid++; *vpp = fvp; - vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(fvp, dvp->v_mount); if (error != 0) { *vpp = NULLVP; return (error); } return (0); bad:; if (fvp) vrele(fvp); return (error); } static int portal_connect(so, so2) struct socket *so; struct socket *so2; { /* from unp_connect, bypassing the namei stuff... */ struct socket *so3; struct unpcb *unp2; struct unpcb *unp3; if (so2 == 0) return (ECONNREFUSED); if (so->so_type != so2->so_type) return (EPROTOTYPE); if ((so2->so_options & SO_ACCEPTCONN) == 0) return (ECONNREFUSED); if ((so3 = sonewconn(so2, 0)) == 0) return (ECONNREFUSED); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) sodupsockaddr((struct sockaddr *)unp2->unp_addr, M_NOWAIT); so2 = so3; return (uipc_connect2(so, so2)); } static int portal_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct socket *so = 0; struct portalnode *pt; struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct uio auio; struct iovec aiov[2]; int res; struct mbuf *cm = 0; struct cmsghdr *cmsg; int newfds; int *ip; int fd; int error; int len; struct portalmount *fmp; struct file *fp; struct portal_cred pcred; /* * Nothing to do when opening the root node. */ if (vp->v_vflag & VV_ROOT) return (0); /* * Can't be opened unless the caller is set up * to deal with the side effects. Check for this * by testing whether td_dupfd has been set. */ if (td->td_dupfd >= 0) return (ENODEV); pt = VTOPORTAL(vp); fmp = VFSTOPORTAL(vp->v_mount); /* * Create a new socket. */ error = socreate(AF_UNIX, &so, SOCK_STREAM, 0, ap->a_td->td_ucred, ap->a_td); if (error) goto bad; /* * Reserve some buffer space */ res = pt->pt_size + sizeof(pcred) + 512; /* XXX */ error = soreserve(so, res, res); if (error) goto bad; /* * Kick off connection */ error = portal_connect(so, fmp->pm_server->f_data); if (error) goto bad; /* * Wait for connection to complete */ /* * XXX: Since the mount point is holding a reference on the * underlying server socket, it is not easy to find out whether * the server process is still running. To handle this problem * we loop waiting for the new socket to be connected (something * which will only happen if the server is still running) or for * the reference count on the server socket to drop to 1, which * will happen if the server dies. Sleep for 5 second intervals * and keep polling the reference count. XXX. */ /* XXXRW: Locking? */ SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { if (fmp->pm_server->f_count == 1) { SOCK_UNLOCK(so); error = ECONNREFUSED; goto bad; } (void) msleep((caddr_t) &so->so_timeo, SOCK_MTX(so), PSOCK, "portalcon", 5 * hz); } SOCK_UNLOCK(so); if (so->so_error) { error = so->so_error; goto bad; } /* * Set miscellaneous flags */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_timeo = 0; so->so_rcv.sb_flags |= SB_NOINTR; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_timeo = 0; so->so_snd.sb_flags |= SB_NOINTR; SOCKBUF_UNLOCK(&so->so_snd); pcred.pcr_flag = ap->a_mode; pcred.pcr_uid = ap->a_cred->cr_uid; pcred.pcr_ngroups = ap->a_cred->cr_ngroups; bcopy(ap->a_cred->cr_groups, pcred.pcr_groups, NGROUPS * sizeof(gid_t)); aiov[0].iov_base = (caddr_t) &pcred; aiov[0].iov_len = sizeof(pcred); aiov[1].iov_base = pt->pt_arg; aiov[1].iov_len = pt->pt_size; auio.uio_iov = aiov; auio.uio_iovcnt = 2; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = aiov[0].iov_len + aiov[1].iov_len; error = sosend(so, (struct sockaddr *) 0, &auio, (struct mbuf *) 0, (struct mbuf *) 0, 0 , td); if (error) goto bad; len = auio.uio_resid = sizeof(int); do { struct mbuf *m = 0; int flags = MSG_WAITALL; error = soreceive(so, (struct sockaddr **) 0, &auio, &m, &cm, &flags); if (error) goto bad; /* * Grab an error code from the mbuf. */ if (m) { m = m_pullup(m, sizeof(int)); /* Needed? */ if (m) { error = *(mtod(m, int *)); m_freem(m); } else { error = EINVAL; } } else { if (cm == 0) { error = ECONNRESET; /* XXX */ #ifdef notdef break; #endif } } } while (cm == 0 && auio.uio_resid == len && !error); if (cm == 0) goto bad; if (auio.uio_resid) { error = 0; #ifdef notdef error = EMSGSIZE; goto bad; #endif } /* * XXX: Break apart the control message, and retrieve the * received file descriptor. Note that more than one descriptor * may have been received, or that the rights chain may have more * than a single mbuf in it. What to do? */ cmsg = mtod(cm, struct cmsghdr *); newfds = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof (int); if (newfds == 0) { error = ECONNREFUSED; goto bad; } /* * At this point the rights message consists of a control message * header, followed by a data region containing a vector of * integer file descriptors. The fds were allocated by the action * of receiving the control message. */ ip = (int *) (cmsg + 1); fd = *ip++; if (newfds > 1) { /* * Close extra fds. */ int i; printf("portal_open: %d extra fds\n", newfds - 1); for (i = 1; i < newfds; i++) { portal_closefd(td, *ip); ip++; } } /* * Check that the mode the file is being opened for is a subset * of the mode of the existing descriptor. */ if ((error = fget(td, fd, &fp)) != 0) goto bad; if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdrop(fp, td); portal_closefd(td, fd); error = EACCES; goto bad; } fdrop(fp, td); /* * Save the dup fd in the proc structure then return the * special error code (ENXIO) which causes magic things to * happen in vn_open. The whole concept is, well, hmmm. */ td->td_dupfd = fd; error = ENXIO; bad:; /* * And discard the control message. */ if (cm) { m_freem(cm); } if (so) { soshutdown(so, 2); soclose(so); } return (error); } static int portal_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; bzero(vap, sizeof(*vap)); vattr_null(vap); vap->va_uid = 0; vap->va_gid = 0; vap->va_size = DEV_BSIZE; vap->va_blocksize = DEV_BSIZE; nanotime(&vap->va_atime); vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; /* vap->va_qbytes = 0; */ vap->va_bytes = 0; /* vap->va_qsize = 0; */ if (vp->v_vflag & VV_ROOT) { vap->va_type = VDIR; vap->va_mode = S_IRUSR|S_IWUSR|S_IXUSR| S_IRGRP|S_IWGRP|S_IXGRP| S_IROTH|S_IWOTH|S_IXOTH; vap->va_nlink = 2; vap->va_fileid = 2; } else { vap->va_type = VREG; vap->va_mode = S_IRUSR|S_IWUSR| S_IRGRP|S_IWGRP| S_IROTH|S_IWOTH; vap->va_nlink = 1; vap->va_fileid = VTOPORTAL(vp)->pt_fileid; } return (0); } static int portal_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { /* * Can't mess with the root vnode */ if (ap->a_vp->v_vflag & VV_ROOT) return (EACCES); if (ap->a_vap->va_flags != VNOVAL) return (EOPNOTSUPP); return (0); } /* * Fake readdir, just return empty directory. * It is hard to deal with '.' and '..' so don't bother. */ static int portal_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { /* * We don't allow exporting portal mounts, and currently local * requests do not need cookies. */ if (ap->a_ncookies) panic("portal_readdir: not hungry"); return (0); } static int portal_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct portalnode *pt = VTOPORTAL(ap->a_vp); if (pt->pt_arg) { free((caddr_t) pt->pt_arg, M_TEMP); pt->pt_arg = 0; } FREE(ap->a_vp->v_data, M_TEMP); ap->a_vp->v_data = 0; return (0); } struct vop_vector portal_vnodeops = { .vop_default = &default_vnodeops, .vop_access = VOP_NULL, .vop_getattr = portal_getattr, .vop_lookup = portal_lookup, .vop_open = portal_open, .vop_pathconf = vop_stdpathconf, .vop_readdir = portal_readdir, .vop_reclaim = portal_reclaim, .vop_setattr = portal_setattr, }; Index: head/sys/fs/procfs/procfs.c =================================================================== --- head/sys/fs/procfs/procfs.c (revision 175201) +++ head/sys/fs/procfs/procfs.c (revision 175202) @@ -1,210 +1,210 @@ /*- * Copyright (c) 2001 Dag-Erling Smørgrav * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95 * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Filler function for proc/pid/self */ int procfs_doprocfile(PFS_FILL_ARGS) { char *fullpath = "unknown"; char *freepath = NULL; struct vnode *textvp; int err; textvp = p->p_textvp; VI_LOCK(textvp); vholdl(textvp); - err = vn_lock(textvp, LK_EXCLUSIVE | LK_INTERLOCK, td); + err = vn_lock(textvp, LK_EXCLUSIVE | LK_INTERLOCK); vdrop(textvp); if (err) return (err); vn_fullpath(td, textvp, &fullpath, &freepath); VOP_UNLOCK(textvp, 0, td); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/curproc */ int procfs_docurproc(PFS_FILL_ARGS) { sbuf_printf(sb, "%ld", (long)td->td_proc->p_pid); return (0); } /* * Adjust mode for some nodes that need it */ int procfs_attr(PFS_ATTR_ARGS) { PROC_LOCK_ASSERT(p, MA_OWNED); /* XXX inefficient, split into separate functions */ if (strcmp(pn->pn_name, "ctl") == 0 || strcmp(pn->pn_name, "note") == 0 || strcmp(pn->pn_name, "notepg") == 0) vap->va_mode = 0200; else if (strcmp(pn->pn_name, "mem") == 0 || strcmp(pn->pn_name, "regs") == 0 || strcmp(pn->pn_name, "dbregs") == 0 || strcmp(pn->pn_name, "fpregs") == 0) vap->va_mode = 0600; if ((p->p_flag & P_SUGID) && pn->pn_type != pfstype_procdir) vap->va_mode = 0; vap->va_uid = p->p_ucred->cr_uid; vap->va_gid = p->p_ucred->cr_gid; return (0); } /* * Visibility: some files only exist for non-system processes * Non-static because linprocfs uses it. */ int procfs_notsystem(PFS_VIS_ARGS) { PROC_LOCK_ASSERT(p, MA_OWNED); return ((p->p_flag & P_SYSTEM) == 0); } /* * Visibility: some files are only visible to process that can debug * the target process. */ int procfs_candebug(PFS_VIS_ARGS) { PROC_LOCK_ASSERT(p, MA_OWNED); return ((p->p_flag & P_SYSTEM) == 0 && p_candebug(td, p) == 0); } /* * Constructor */ static int procfs_init(PFS_INIT_ARGS) { struct pfs_node *root; struct pfs_node *dir; struct pfs_node *node; root = pi->pi_root; pfs_create_link(root, "curproc", procfs_docurproc, NULL, NULL, NULL, 0); dir = pfs_create_dir(root, "pid", procfs_attr, NULL, NULL, PFS_PROCDEP); pfs_create_file(dir, "cmdline", procfs_doproccmdline, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ctl", procfs_doprocctl, procfs_attr, NULL, NULL, PFS_WR); pfs_create_file(dir, "dbregs", procfs_doprocdbregs, procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW); pfs_create_file(dir, "etype", procfs_doproctype, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "fpregs", procfs_doprocfpregs, procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW); pfs_create_file(dir, "map", procfs_doprocmap, NULL, procfs_notsystem, NULL, PFS_RD); node = pfs_create_file(dir, "mem", procfs_doprocmem, procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW); node->pn_ioctl = procfs_ioctl; node->pn_close = procfs_close; pfs_create_file(dir, "note", procfs_doprocnote, procfs_attr, procfs_candebug, NULL, PFS_WR); pfs_create_file(dir, "notepg", procfs_doprocnote, procfs_attr, procfs_candebug, NULL, PFS_WR); pfs_create_file(dir, "regs", procfs_doprocregs, procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW); pfs_create_file(dir, "rlimit", procfs_doprocrlimit, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "status", procfs_doprocstatus, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "file", procfs_doprocfile, NULL, procfs_notsystem, NULL, 0); return (0); } /* * Destructor */ static int procfs_uninit(PFS_INIT_ARGS) { /* nothing to do, pseudofs will GC */ return (0); } PSEUDOFS(procfs, 1); Index: head/sys/fs/procfs/procfs_map.c =================================================================== --- head/sys/fs/procfs/procfs_map.c (revision 175201) +++ head/sys/fs/procfs/procfs_map.c (revision 175202) @@ -1,244 +1,244 @@ /*- * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 * * $FreeBSD$ */ #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_IA32 #include #include #include extern struct sysentvec ia32_freebsd_sysvec; #endif #define MEBUFFERSIZE 256 /* * The map entries can *almost* be read with programs like cat. However, * large maps need special programs to read. It is not easy to implement * a program that can sense the required size of the buffer, and then * subsequently do a read with the appropriate size. This operation cannot * be atomic. The best that we can do is to allow the program to do a read * with an arbitrarily large buffer, and return as much as we can. We can * return an error code if the buffer is too small (EFBIG), then the program * can try a bigger buffer. */ int procfs_doprocmap(PFS_FILL_ARGS) { int len; int error, vfslocked; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry, tmp_entry; struct vnode *vp; char mebuffer[MEBUFFERSIZE]; char *fullpath, *freepath; unsigned int last_timestamp; #ifdef COMPAT_IA32 int wrap32 = 0; #endif PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); if (uio->uio_offset != 0) return (0); #ifdef COMPAT_IA32 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { if (p->p_sysent != &ia32_freebsd_sysvec) return (EOPNOTSUPP); wrap32 = 1; } #endif vm_map_lock_read(map); for (entry = map->header.next; ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { vm_object_t obj, tobj, lobj; int ref_count, shadow_count, flags; vm_offset_t addr; int resident, privateresident; char *type; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; privateresident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); if (obj->shadow_count == 1) privateresident = obj->resident_page_count; } resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_LOCK(tobj); if (lobj != obj) VM_OBJECT_UNLOCK(lobj); lobj = tobj; } freepath = NULL; fullpath = "-"; if (lobj) { switch(lobj->type) { default: case OBJT_DEFAULT: type = "default"; vp = NULL; break; case OBJT_VNODE: type = "vnode"; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: type = "swap"; vp = NULL; break; case OBJT_DEVICE: type = "device"; vp = NULL; break; } if (lobj != obj) VM_OBJECT_UNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_UNLOCK(obj); if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vn_fullpath(td, vp, &fullpath, &freepath); vput(vp); VFS_UNLOCK_GIANT(vfslocked); } } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, resident, private resident, cow, access, type. */ snprintf(mebuffer, sizeof mebuffer, "0x%lx 0x%lx %d %d %p %s%s%s %d %d 0x%x %s %s %s %s\n", (u_long)entry->start, (u_long)entry->end, resident, privateresident, #ifdef COMPAT_IA32 wrap32 ? NULL : obj, /* Hide 64 bit value */ #else obj, #endif (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", ref_count, shadow_count, flags, (entry->eflags & MAP_ENTRY_COW)?"COW":"NCOW", (entry->eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC", type, fullpath); if (freepath != NULL) free(freepath, M_TEMP); len = strlen(mebuffer); if (len > uio->uio_resid) { error = EFBIG; break; } last_timestamp = map->timestamp; vm_map_unlock_read(map); error = uiomove(mebuffer, len, uio); vm_map_lock_read(map); if (error) break; if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); return (error); } Index: head/sys/fs/pseudofs/pseudofs_vncache.c =================================================================== --- head/sys/fs/pseudofs/pseudofs_vncache.c (revision 175201) +++ head/sys/fs/pseudofs/pseudofs_vncache.c (revision 175202) @@ -1,308 +1,308 @@ /*- * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_pseudofs.h" #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache"); static struct mtx pfs_vncache_mutex; static struct pfs_vdata *pfs_vncache; static eventhandler_tag pfs_exit_tag; static void pfs_exit(void *arg, struct proc *p); SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW, 0, "pseudofs vnode cache"); static int pfs_vncache_entries; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD, &pfs_vncache_entries, 0, "number of entries in the vnode cache"); static int pfs_vncache_maxentries; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD, &pfs_vncache_maxentries, 0, "highest number of entries in the vnode cache"); static int pfs_vncache_hits; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD, &pfs_vncache_hits, 0, "number of cache hits since initialization"); static int pfs_vncache_misses; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD, &pfs_vncache_misses, 0, "number of cache misses since initialization"); extern struct vop_vector pfs_vnodeops; /* XXX -> .h file */ /* * Initialize vnode cache */ void pfs_vncache_load(void) { mtx_assert(&Giant, MA_OWNED); mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF); pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL, EVENTHANDLER_PRI_ANY); } /* * Tear down vnode cache */ void pfs_vncache_unload(void) { mtx_assert(&Giant, MA_OWNED); EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag); KASSERT(pfs_vncache_entries == 0, ("%d vncache entries remaining", pfs_vncache_entries)); mtx_destroy(&pfs_vncache_mutex); } /* * Allocate a vnode */ int pfs_vncache_alloc(struct mount *mp, struct vnode **vpp, struct pfs_node *pn, pid_t pid) { struct pfs_vdata *pvd; struct vnode *vp; int error; /* * See if the vnode is in the cache. * XXX linear search is not very efficient. */ retry: mtx_lock(&pfs_vncache_mutex); for (pvd = pfs_vncache; pvd; pvd = pvd->pvd_next) { if (pvd->pvd_pn == pn && pvd->pvd_pid == pid && pvd->pvd_vnode->v_mount == mp) { vp = pvd->pvd_vnode; VI_LOCK(vp); mtx_unlock(&pfs_vncache_mutex); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { ++pfs_vncache_hits; *vpp = vp; /* * Some callers cache_enter(vp) later, so * we have to make sure it's not in the * VFS cache so it doesn't get entered * twice. A better solution would be to * make pfs_vncache_alloc() responsible * for entering the vnode in the VFS * cache. */ cache_purge(vp); return (0); } goto retry; } } mtx_unlock(&pfs_vncache_mutex); ++pfs_vncache_misses; /* nope, get a new one */ MALLOC(pvd, struct pfs_vdata *, sizeof *pvd, M_PFSVNCACHE, M_WAITOK); mtx_lock(&pfs_vncache_mutex); if (++pfs_vncache_entries > pfs_vncache_maxentries) pfs_vncache_maxentries = pfs_vncache_entries; mtx_unlock(&pfs_vncache_mutex); error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp); if (error) { mtx_lock(&pfs_vncache_mutex); --pfs_vncache_entries; mtx_unlock(&pfs_vncache_mutex); FREE(pvd, M_PFSVNCACHE); return (error); } pvd->pvd_pn = pn; pvd->pvd_pid = pid; (*vpp)->v_data = pvd; switch (pn->pn_type) { case pfstype_root: (*vpp)->v_vflag = VV_ROOT; #if 0 printf("root vnode allocated\n"); #endif /* fall through */ case pfstype_dir: case pfstype_this: case pfstype_parent: case pfstype_procdir: (*vpp)->v_type = VDIR; break; case pfstype_file: (*vpp)->v_type = VREG; break; case pfstype_symlink: (*vpp)->v_type = VLNK; break; case pfstype_none: KASSERT(0, ("pfs_vncache_alloc called for null node\n")); default: panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type); } /* * Propagate flag through to vnode so users know it can change * if the process changes (i.e. execve) */ if ((pn->pn_flags & PFS_PROCDEP) != 0) (*vpp)->v_vflag |= VV_PROCDEP; pvd->pvd_vnode = *vpp; (*vpp)->v_vnlock->lk_flags |= LK_CANRECURSE; - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(*vpp, mp); if (error != 0) { mtx_lock(&pfs_vncache_mutex); --pfs_vncache_entries; mtx_unlock(&pfs_vncache_mutex); FREE(pvd, M_PFSVNCACHE); *vpp = NULLVP; return (error); } mtx_lock(&pfs_vncache_mutex); pvd->pvd_prev = NULL; pvd->pvd_next = pfs_vncache; if (pvd->pvd_next) pvd->pvd_next->pvd_prev = pvd; pfs_vncache = pvd; mtx_unlock(&pfs_vncache_mutex); return (0); } /* * Free a vnode */ int pfs_vncache_free(struct vnode *vp) { struct pfs_vdata *pvd; mtx_lock(&pfs_vncache_mutex); pvd = (struct pfs_vdata *)vp->v_data; KASSERT(pvd != NULL, ("pfs_vncache_free(): no vnode data\n")); if (pvd->pvd_next) pvd->pvd_next->pvd_prev = pvd->pvd_prev; if (pvd->pvd_prev) pvd->pvd_prev->pvd_next = pvd->pvd_next; else pfs_vncache = pvd->pvd_next; --pfs_vncache_entries; mtx_unlock(&pfs_vncache_mutex); FREE(pvd, M_PFSVNCACHE); vp->v_data = NULL; return (0); } /* * Purge the cache of dead entries * * This is extremely inefficient due to the fact that vgone() not only * indirectly modifies the vnode cache, but may also sleep. We can * neither hold pfs_vncache_mutex across a vgone() call, nor make any * assumptions about the state of the cache after vgone() returns. In * consequence, we must start over after every vgone() call, and keep * trying until we manage to traverse the entire cache. * * The only way to improve this situation is to change the data structure * used to implement the cache. */ void pfs_purge(struct pfs_node *pn) { struct pfs_vdata *pvd; struct vnode *vnp; mtx_lock(&pfs_vncache_mutex); pvd = pfs_vncache; while (pvd != NULL) { if (pvd->pvd_dead || (pn != NULL && pvd->pvd_pn == pn)) { vnp = pvd->pvd_vnode; vhold(vnp); mtx_unlock(&pfs_vncache_mutex); VOP_LOCK(vnp, LK_EXCLUSIVE, curthread); vgone(vnp); VOP_UNLOCK(vnp, 0, curthread); vdrop(vnp); mtx_lock(&pfs_vncache_mutex); pvd = pfs_vncache; } else { pvd = pvd->pvd_next; } } mtx_unlock(&pfs_vncache_mutex); } /* * Free all vnodes associated with a defunct process * * XXXRW: It is unfortunate that pfs_exit() always acquires and releases two * mutexes (one of which is Giant) for every process exit, even if procfs * isn't mounted. */ static void pfs_exit(void *arg, struct proc *p) { struct pfs_vdata *pvd; int dead; if (pfs_vncache == NULL) return; mtx_lock(&Giant); mtx_lock(&pfs_vncache_mutex); for (pvd = pfs_vncache, dead = 0; pvd != NULL; pvd = pvd->pvd_next) if (pvd->pvd_pid == p->p_pid) dead = pvd->pvd_dead = 1; mtx_unlock(&pfs_vncache_mutex); if (dead) pfs_purge(NULL); mtx_unlock(&Giant); } Index: head/sys/fs/pseudofs/pseudofs_vnops.c =================================================================== --- head/sys/fs/pseudofs/pseudofs_vnops.c (revision 175201) +++ head/sys/fs/pseudofs/pseudofs_vnops.c (revision 175202) @@ -1,892 +1,892 @@ /*- * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_pseudofs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Returns the fileno, adjusted for target pid */ static uint32_t pn_fileno(struct pfs_node *pn, pid_t pid) { KASSERT(pn->pn_fileno > 0, ("%s(): no fileno allocated", __func__)); if (pid != NO_PID) return (pn->pn_fileno * NO_PID + pid); return (pn->pn_fileno); } /* * Returns non-zero if given file is visible to given thread. */ static int pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc) { int visible; if (proc == NULL) return (0); PROC_LOCK_ASSERT(proc, MA_OWNED); visible = ((proc->p_flag & P_WEXIT) == 0); if (visible) visible = (p_cansee(td, proc) == 0); if (visible && pn->pn_vis != NULL) visible = pn_vis(td, proc, pn); if (!visible) return (0); return (1); } static int pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid, struct proc **p) { struct proc *proc; PFS_TRACE(("%s (pid: %d, req: %d)", pn->pn_name, pid, td->td_proc->p_pid)); if (p) *p = NULL; if (pid == NO_PID) PFS_RETURN (1); if ((proc = pfind(pid)) == NULL) PFS_RETURN (0); if (pfs_visible_proc(td, pn, proc)) { if (p) *p = proc; else PROC_UNLOCK(proc); PFS_RETURN (1); } PROC_UNLOCK(proc); PFS_RETURN (0); } /* * Verify permissions */ static int pfs_access(struct vop_access_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct vattr vattr; int error; PFS_TRACE(("%s", pvd->pvd_pn->pn_name)); (void)pvd; error = VOP_GETATTR(vn, &vattr, va->a_cred, va->a_td); if (error) PFS_RETURN (error); error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, va->a_mode, va->a_cred, NULL); PFS_RETURN (error); } /* * Close a file or directory */ static int pfs_close(struct vop_close_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct proc *proc; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); /* * Do nothing unless this is the last close and the node has a * last-close handler. */ if (vrefcnt(vn) > 1 || pn->pn_close == NULL) PFS_RETURN (0); if (pvd->pvd_pid != NO_PID) { proc = pfind(pvd->pvd_pid); } else { proc = NULL; } error = pn_close(va->a_td, proc, pn); if (proc != NULL) PROC_UNLOCK(proc); PFS_RETURN (error); } /* * Get file attributes */ static int pfs_getattr(struct vop_getattr_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct vattr *vap = va->a_vap; struct proc *proc; int error = 0; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (ENOENT); VATTR_NULL(vap); vap->va_type = vn->v_type; vap->va_fileid = pn_fileno(pn, pvd->pvd_pid); vap->va_flags = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0]; vap->va_nlink = 1; nanotime(&vap->va_ctime); vap->va_atime = vap->va_mtime = vap->va_ctime; switch (pn->pn_type) { case pfstype_procdir: case pfstype_root: case pfstype_dir: #if 0 pfs_lock(pn); /* compute link count */ pfs_unlock(pn); #endif vap->va_mode = 0555; break; case pfstype_file: case pfstype_symlink: vap->va_mode = 0444; break; default: printf("shouldn't be here!\n"); vap->va_mode = 0; break; } if (proc != NULL) { vap->va_uid = proc->p_ucred->cr_ruid; vap->va_gid = proc->p_ucred->cr_rgid; if (pn->pn_attr != NULL) error = pn_attr(va->a_td, proc, pn, vap); PROC_UNLOCK(proc); } else { vap->va_uid = 0; vap->va_gid = 0; } PFS_RETURN (error); } /* * Perform an ioctl */ static int pfs_ioctl(struct vop_ioctl_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct proc *proc; int error; PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) PFS_RETURN (EINVAL); if (pn->pn_ioctl == NULL) PFS_RETURN (ENOTTY); /* * This is necessary because process' privileges may * have changed since the open() call. */ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data); if (proc != NULL) PROC_UNLOCK(proc); PFS_RETURN (error); } /* * Perform getextattr */ static int pfs_getextattr(struct vop_getextattr_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct proc *proc; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); /* * This is necessary because either process' privileges may * have changed since the open() call. */ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (pn->pn_getextattr == NULL) error = EOPNOTSUPP; else error = pn_getextattr(curthread, proc, pn, va->a_attrnamespace, va->a_name, va->a_uio, va->a_size, va->a_cred); if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pn); PFS_RETURN (error); } /* * Look up a file or directory */ static int pfs_lookup(struct vop_cachedlookup_args *va) { struct vnode *vn = va->a_dvp; struct vnode **vpp = va->a_vpp; struct componentname *cnp = va->a_cnp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pd = pvd->pvd_pn; struct pfs_node *pn, *pdn = NULL; pid_t pid = pvd->pvd_pid; char *pname; int error, i, namelen, visible; PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr)); pfs_assert_not_owned(pd); if (vn->v_type != VDIR) PFS_RETURN (ENOTDIR); error = VOP_ACCESS(vn, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) PFS_RETURN (error); /* * Don't support DELETE or RENAME. CREATE is supported so * that O_CREAT will work, but the lookup will still fail if * the file does not exist. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) PFS_RETURN (EOPNOTSUPP); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= PFS_NAMELEN) PFS_RETURN (ENOENT); /* check that parent directory is visible... */ if (!pfs_visible(curthread, pd, pvd->pvd_pid, NULL)) PFS_RETURN (ENOENT); /* self */ namelen = cnp->cn_namelen; pname = cnp->cn_nameptr; if (namelen == 1 && pname[0] == '.') { pn = pd; *vpp = vn; VREF(vn); PFS_RETURN (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (pd->pn_type == pfstype_root) PFS_RETURN (EIO); VOP_UNLOCK(vn, 0, cnp->cn_thread); KASSERT(pd->pn_parent != NULL, ("%s(): non-root directory has no parent", __func__)); /* * This one is tricky. Descendents of procdir nodes * inherit their parent's process affinity, but * there's no easy reverse mapping. For simplicity, * we assume that if this node is a procdir, its * parent isn't (which is correct as long as * descendents of procdir nodes are never procdir * nodes themselves) */ if (pd->pn_type == pfstype_procdir) pid = NO_PID; pfs_lock(pd); pn = pd->pn_parent; pfs_unlock(pd); goto got_pnode; } pfs_lock(pd); /* named node */ for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next) if (pn->pn_type == pfstype_procdir) pdn = pn; else if (pn->pn_name[namelen] == '\0' && bcmp(pname, pn->pn_name, namelen) == 0) { pfs_unlock(pd); goto got_pnode; } /* process dependent node */ if ((pn = pdn) != NULL) { pid = 0; for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i) if ((pid = pid * 10 + pname[i] - '0') > PID_MAX) break; if (i == cnp->cn_namelen) { pfs_unlock(pd); goto got_pnode; } } pfs_unlock(pd); PFS_RETURN (ENOENT); got_pnode: pfs_assert_not_owned(pd); pfs_assert_not_owned(pn); visible = pfs_visible(curthread, pn, pid, NULL); if (!visible) { error = ENOENT; goto failed; } error = pfs_vncache_alloc(vn->v_mount, vpp, pn, pid); if (error) goto failed; if (cnp->cn_flags & ISDOTDOT) - vn_lock(vn, LK_EXCLUSIVE|LK_RETRY, cnp->cn_thread); + vn_lock(vn, LK_EXCLUSIVE|LK_RETRY); if (cnp->cn_flags & MAKEENTRY) cache_enter(vn, *vpp, cnp); PFS_RETURN (0); failed: if (cnp->cn_flags & ISDOTDOT) - vn_lock(vn, LK_EXCLUSIVE|LK_RETRY, cnp->cn_thread); + vn_lock(vn, LK_EXCLUSIVE|LK_RETRY); PFS_RETURN(error); } /* * Open a file or directory. */ static int pfs_open(struct vop_open_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; int mode = va->a_mode; PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode)); pfs_assert_not_owned(pn); /* check if the requested mode is permitted */ if (((mode & FREAD) && !(mode & PFS_RD)) || ((mode & FWRITE) && !(mode & PFS_WR))) PFS_RETURN (EPERM); /* we don't support locking */ if ((mode & O_SHLOCK) || (mode & O_EXLOCK)) PFS_RETURN (EOPNOTSUPP); PFS_RETURN (0); } /* * Read from a file */ static int pfs_read(struct vop_read_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc; struct sbuf *sb = NULL; int error; unsigned int buflen, offset, resid; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) PFS_RETURN (EINVAL); if (!(pn->pn_flags & PFS_RD)) PFS_RETURN (EBADF); if (pn->pn_fill == NULL) PFS_RETURN (EIO); /* * This is necessary because either process' privileges may * have changed since the open() call. */ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (proc != NULL) { _PHOLD(proc); PROC_UNLOCK(proc); } if (pn->pn_flags & PFS_RAWRD) { PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid)); error = pn_fill(curthread, proc, pn, NULL, uio); PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid)); if (proc != NULL) PRELE(proc); PFS_RETURN (error); } /* beaucoup sanity checks so we don't ask for bogus allocation */ if (uio->uio_offset < 0 || uio->uio_resid < 0 || (offset = uio->uio_offset) != uio->uio_offset || (resid = uio->uio_resid) != uio->uio_resid || (buflen = offset + resid + 1) < offset || buflen > INT_MAX) { if (proc != NULL) PRELE(proc); PFS_RETURN (EINVAL); } if (buflen > MAXPHYS + 1) { if (proc != NULL) PRELE(proc); PFS_RETURN (EIO); } sb = sbuf_new(sb, NULL, buflen, 0); if (sb == NULL) { if (proc != NULL) PRELE(proc); PFS_RETURN (EIO); } error = pn_fill(curthread, proc, pn, sb, uio); if (proc != NULL) PRELE(proc); if (error) { sbuf_delete(sb); PFS_RETURN (error); } sbuf_finish(sb); error = uiomove_frombuf(sbuf_data(sb), sbuf_len(sb), uio); sbuf_delete(sb); PFS_RETURN (error); } /* * Iterate through directory entries */ static int pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd, struct pfs_node **pn, struct proc **p) { int visible; sx_assert(&allproc_lock, SX_SLOCKED); pfs_assert_owned(pd); again: if (*pn == NULL) { /* first node */ *pn = pd->pn_nodes; } else if ((*pn)->pn_type != pfstype_procdir) { /* next node */ *pn = (*pn)->pn_next; } if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) { /* next process */ if (*p == NULL) *p = LIST_FIRST(&allproc); else *p = LIST_NEXT(*p, p_list); /* out of processes: next node */ if (*p == NULL) *pn = (*pn)->pn_next; else PROC_LOCK(*p); } if ((*pn) == NULL) return (-1); if (*p != NULL) { visible = pfs_visible_proc(td, *pn, *p); PROC_UNLOCK(*p); } else if (proc != NULL) { visible = pfs_visible_proc(td, *pn, proc); } else { visible = 1; } if (!visible) goto again; return (0); } /* * Return directory entries. */ static int pfs_readdir(struct vop_readdir_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pd = pvd->pvd_pn; pid_t pid = pvd->pvd_pid; struct proc *p, *proc; struct pfs_node *pn; struct dirent *entry; struct uio *uio; off_t offset; int error, i, resid; char *buf, *ent; KASSERT(pd->pn_info == vn->v_mount->mnt_data, ("%s(): pn_info does not match mountpoint", __func__)); PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid)); pfs_assert_not_owned(pd); if (vn->v_type != VDIR) PFS_RETURN (ENOTDIR); uio = va->a_uio; /* only allow reading entire entries */ offset = uio->uio_offset; resid = uio->uio_resid; if (offset < 0 || offset % PFS_DELEN != 0 || (resid && resid < PFS_DELEN)) PFS_RETURN (EINVAL); if (resid == 0) PFS_RETURN (0); /* can't do this while holding the proc lock... */ buf = malloc(resid, M_IOV, M_WAITOK | M_ZERO); sx_slock(&allproc_lock); pfs_lock(pd); /* check if the directory is visible to the caller */ if (!pfs_visible(curthread, pd, pid, &proc)) { sx_sunlock(&allproc_lock); pfs_unlock(pd); free(buf, M_IOV); PFS_RETURN (ENOENT); } KASSERT(pid == NO_PID || proc != NULL, ("%s(): no process for pid %lu", __func__, (unsigned long)pid)); /* skip unwanted entries */ for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) { if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) { /* nothing left... */ if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pd); sx_sunlock(&allproc_lock); free(buf, M_IOV); PFS_RETURN (0); } } /* fill in entries */ ent = buf; while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 && resid >= PFS_DELEN) { entry = (struct dirent *)ent; entry->d_reclen = PFS_DELEN; entry->d_fileno = pn_fileno(pn, pid); /* PFS_DELEN was picked to fit PFS_NAMLEN */ for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i) entry->d_name[i] = pn->pn_name[i]; entry->d_name[i] = 0; entry->d_namlen = i; switch (pn->pn_type) { case pfstype_procdir: KASSERT(p != NULL, ("reached procdir node with p == NULL")); entry->d_namlen = snprintf(entry->d_name, PFS_NAMELEN, "%d", p->p_pid); /* fall through */ case pfstype_root: case pfstype_dir: case pfstype_this: case pfstype_parent: entry->d_type = DT_DIR; break; case pfstype_file: entry->d_type = DT_REG; break; case pfstype_symlink: entry->d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type); } PFS_TRACE(("%s", entry->d_name)); offset += PFS_DELEN; resid -= PFS_DELEN; ent += PFS_DELEN; } if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pd); sx_sunlock(&allproc_lock); PFS_TRACE(("%zd bytes", ent - buf)); error = uiomove(buf, ent - buf, uio); free(buf, M_IOV); PFS_RETURN (error); } /* * Read a symbolic link */ static int pfs_readlink(struct vop_readlink_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc = NULL; char buf[PATH_MAX]; struct sbuf sb; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VLNK) PFS_RETURN (EINVAL); if (pn->pn_fill == NULL) PFS_RETURN (EIO); if (pvd->pvd_pid != NO_PID) { if ((proc = pfind(pvd->pvd_pid)) == NULL) PFS_RETURN (EIO); if (proc->p_flag & P_WEXIT) { PROC_UNLOCK(proc); PFS_RETURN (EIO); } _PHOLD(proc); PROC_UNLOCK(proc); } /* sbuf_new() can't fail with a static buffer */ sbuf_new(&sb, buf, sizeof buf, 0); error = pn_fill(curthread, proc, pn, &sb, NULL); if (proc != NULL) PRELE(proc); if (error) { sbuf_delete(&sb); PFS_RETURN (error); } sbuf_finish(&sb); error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio); sbuf_delete(&sb); PFS_RETURN (error); } /* * Reclaim a vnode */ static int pfs_reclaim(struct vop_reclaim_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); return (pfs_vncache_free(va->a_vp)); } /* * Set attributes */ static int pfs_setattr(struct vop_setattr_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); PFS_RETURN (EOPNOTSUPP); } /* * Write to a file */ static int pfs_write(struct vop_write_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc; struct sbuf sb; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) PFS_RETURN (EINVAL); KASSERT(pn->pn_type != pfstype_file, ("%s(): VREG vnode refers to non-file pfs_node", __func__)); if (!(pn->pn_flags & PFS_WR)) PFS_RETURN (EBADF); if (pn->pn_fill == NULL) PFS_RETURN (EIO); /* * This is necessary because either process' privileges may * have changed since the open() call. */ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (proc != NULL) { _PHOLD(proc); PROC_UNLOCK(proc); } if (pn->pn_flags & PFS_RAWWR) { pfs_lock(pn); error = pn_fill(curthread, proc, pn, NULL, uio); pfs_unlock(pn); if (proc != NULL) PRELE(proc); PFS_RETURN (error); } sbuf_uionew(&sb, uio, &error); if (error) { if (proc != NULL) PRELE(proc); PFS_RETURN (error); } error = pn_fill(curthread, proc, pn, &sb, uio); sbuf_delete(&sb); if (proc != NULL) PRELE(proc); PFS_RETURN (error); } /* * Vnode operations */ struct vop_vector pfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = pfs_access, .vop_cachedlookup = pfs_lookup, .vop_close = pfs_close, .vop_create = VOP_EOPNOTSUPP, .vop_getattr = pfs_getattr, .vop_getextattr = pfs_getextattr, .vop_ioctl = pfs_ioctl, .vop_link = VOP_EOPNOTSUPP, .vop_lookup = vfs_cache_lookup, .vop_mkdir = VOP_EOPNOTSUPP, .vop_mknod = VOP_EOPNOTSUPP, .vop_open = pfs_open, .vop_read = pfs_read, .vop_readdir = pfs_readdir, .vop_readlink = pfs_readlink, .vop_reclaim = pfs_reclaim, .vop_remove = VOP_EOPNOTSUPP, .vop_rename = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP, .vop_setattr = pfs_setattr, .vop_symlink = VOP_EOPNOTSUPP, .vop_write = pfs_write, /* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */ }; Index: head/sys/fs/smbfs/smbfs_io.c =================================================================== --- head/sys/fs/smbfs/smbfs_io.c (revision 175201) +++ head/sys/fs/smbfs/smbfs_io.c (revision 175202) @@ -1,711 +1,711 @@ /*- * Copyright (c) 2000-2001, Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include /* defines plimit structure in proc struct */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* #include */ #include #include #include #include #include #include /*#define SMBFS_RWGENERIC*/ extern int smbfs_pbuf_freecnt; static int smbfs_fastlookup = 1; SYSCTL_DECL(_vfs_smbfs); SYSCTL_INT(_vfs_smbfs, OID_AUTO, fastlookup, CTLFLAG_RW, &smbfs_fastlookup, 0, ""); #define DE_SIZE (sizeof(struct dirent)) static int smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) { struct dirent de; struct componentname cn; struct smb_cred scred; struct smbfs_fctx *ctx; struct vnode *newvp; struct smbnode *np = VTOSMB(vp); int error/*, *eofflag = ap->a_eofflag*/; long offset, limit; np = VTOSMB(vp); SMBVDEBUG("dirname='%s'\n", np->n_name); smb_makescred(&scred, uio->uio_td, cred); offset = uio->uio_offset / DE_SIZE; /* offset in the directory */ limit = uio->uio_resid / DE_SIZE; if (uio->uio_resid < DE_SIZE || uio->uio_offset < 0) return EINVAL; while (limit && offset < 2) { limit--; bzero((caddr_t)&de, DE_SIZE); de.d_reclen = DE_SIZE; de.d_fileno = (offset == 0) ? np->n_ino : (np->n_parent ? VTOSMB(np->n_parent)->n_ino : 2); if (de.d_fileno == 0) de.d_fileno = 0x7ffffffd + offset; de.d_namlen = offset + 1; de.d_name[0] = '.'; de.d_name[1] = '.'; de.d_name[offset + 1] = '\0'; de.d_type = DT_DIR; error = uiomove(&de, DE_SIZE, uio); if (error) return error; offset++; uio->uio_offset += DE_SIZE; } if (limit == 0) return 0; if (offset != np->n_dirofs || np->n_dirseq == NULL) { SMBVDEBUG("Reopening search %ld:%ld\n", offset, np->n_dirofs); if (np->n_dirseq) { smbfs_findclose(np->n_dirseq, &scred); np->n_dirseq = NULL; } np->n_dirofs = 2; error = smbfs_findopen(np, "*", 1, SMB_FA_SYSTEM | SMB_FA_HIDDEN | SMB_FA_DIR, &scred, &ctx); if (error) { SMBVDEBUG("can not open search, error = %d", error); return error; } np->n_dirseq = ctx; } else ctx = np->n_dirseq; while (np->n_dirofs < offset) { error = smbfs_findnext(ctx, offset - np->n_dirofs++, &scred); if (error) { smbfs_findclose(np->n_dirseq, &scred); np->n_dirseq = NULL; return error == ENOENT ? 0 : error; } } error = 0; for (; limit; limit--, offset++) { error = smbfs_findnext(ctx, limit, &scred); if (error) break; np->n_dirofs++; bzero((caddr_t)&de, DE_SIZE); de.d_reclen = DE_SIZE; de.d_fileno = ctx->f_attr.fa_ino; de.d_type = (ctx->f_attr.fa_attr & SMB_FA_DIR) ? DT_DIR : DT_REG; de.d_namlen = ctx->f_nmlen; bcopy(ctx->f_name, de.d_name, de.d_namlen); de.d_name[de.d_namlen] = '\0'; if (smbfs_fastlookup) { error = smbfs_nget(vp->v_mount, vp, ctx->f_name, ctx->f_nmlen, &ctx->f_attr, &newvp); if (!error) { cn.cn_nameptr = de.d_name; cn.cn_namelen = de.d_namlen; cache_enter(vp, newvp, &cn); vput(newvp); } } error = uiomove(&de, DE_SIZE, uio); if (error) break; } if (error == ENOENT) error = 0; uio->uio_offset = offset * DE_SIZE; return error; } int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); struct thread *td; struct vattr vattr; struct smb_cred scred; int error, lks; /* * Protect against method which is not supported for now */ if (uiop->uio_segflg == UIO_NOCOPY) return EOPNOTSUPP; if (vp->v_type != VREG && vp->v_type != VDIR) { SMBFSERR("vn types other than VREG or VDIR are unsupported !\n"); return EIO; } if (uiop->uio_resid == 0) return 0; if (uiop->uio_offset < 0) return EINVAL; /* if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize) return EFBIG;*/ td = uiop->uio_td; if (vp->v_type == VDIR) { lks = LK_EXCLUSIVE;/*lockstatus(vp->v_vnlock, td);*/ if (lks == LK_SHARED) - vn_lock(vp, LK_UPGRADE | LK_RETRY, td); + vn_lock(vp, LK_UPGRADE | LK_RETRY); error = smbfs_readvdir(vp, uiop, cred); if (lks == LK_SHARED) - vn_lock(vp, LK_DOWNGRADE | LK_RETRY, td); + vn_lock(vp, LK_DOWNGRADE | LK_RETRY); return error; } /* biosize = SSTOCN(smp->sm_share)->sc_txmax;*/ if (np->n_flag & NMODIFIED) { smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return error; if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) { error = smbfs_vinvalbuf(vp, td); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } } smb_makescred(&scred, td, cred); return smb_read(smp->sm_share, np->n_fid, uiop, &scred); } int smbfs_writevnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag) { struct smbmount *smp = VTOSMBFS(vp); struct smbnode *np = VTOSMB(vp); struct smb_cred scred; struct proc *p; struct thread *td; int error = 0; if (vp->v_type != VREG) { SMBERROR("vn types other than VREG unsupported !\n"); return EIO; } SMBVDEBUG("ofs=%d,resid=%d\n",(int)uiop->uio_offset, uiop->uio_resid); if (uiop->uio_offset < 0) return EINVAL; /* if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize) return (EFBIG);*/ td = uiop->uio_td; p = td->td_proc; if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { smbfs_attr_cacheremove(vp); error = smbfs_vinvalbuf(vp, td); if (error) return error; } if (ioflag & IO_APPEND) { #ifdef notyet /* * File size can be changed by another client */ smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); #endif uiop->uio_offset = np->n_size; } } if (uiop->uio_resid == 0) return 0; if (p != NULL) { PROC_LOCK(p); if (uiop->uio_offset + uiop->uio_resid > lim_cur(p, RLIMIT_FSIZE)) { psignal(p, SIGXFSZ); PROC_UNLOCK(p); return EFBIG; } PROC_UNLOCK(p); } smb_makescred(&scred, td, cred); error = smb_write(smp->sm_share, np->n_fid, uiop, &scred); SMBVDEBUG("after: ofs=%d,resid=%d\n",(int)uiop->uio_offset, uiop->uio_resid); if (!error) { if (uiop->uio_offset > np->n_size) { np->n_size = uiop->uio_offset; vnode_pager_setsize(vp, np->n_size); } } return error; } /* * Do an I/O operation to/from a cache block. */ int smbfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); struct uio uio, *uiop = &uio; struct iovec io; struct smb_cred scred; int error = 0; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = td; smb_makescred(&scred, td, cr); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; error = smb_read(smp->sm_share, np->n_fid, uiop, &scred); if (error) break; if (uiop->uio_resid) { int left = uiop->uio_resid; int nread = bp->b_bcount - left; if (left > 0) bzero((char *)bp->b_data + nread, left); } break; default: printf("smbfs_doio: type %x unexpected\n",vp->v_type); break; }; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } } else { /* write */ if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = smb_write(smp->sm_share, np->n_fid, uiop, &scred); /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; splx(s); } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return 0; } } bp->b_resid = uiop->uio_resid; bufdone(bp); return error; } /* * Vnode op for VM getpages. * Wish wish .... get rid from multiple IO routines */ int smbfs_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; vm_ooffset_t a_offset; } */ *ap; { #ifdef SMBFS_RWGENERIC return vop_stdgetpages(ap); #else int i, error, nextoff, size, toff, npages, count, reqpage; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; struct smbmount *smp; struct smbnode *np; struct smb_cred scred; vm_object_t object; vm_page_t *pages, m; vp = ap->a_vp; if ((object = vp->v_object) == NULL) { printf("smbfs_getpages: called with non-merged cache vnode??\n"); return VM_PAGER_ERROR; } td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ np = VTOSMB(vp); smp = VFSTOSMBFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; npages = btoc(count); reqpage = ap->a_reqpage; /* * If the requested page is partially valid, just return it and * allow the pager to zero-out the blanks. Partially valid pages * can only occur at the file EOF. */ m = pages[reqpage]; VM_OBJECT_LOCK(object); if (m->valid != 0) { /* handled by vm_fault now */ /* vm_page_zero_invalid(m, TRUE); */ vm_page_lock_queues(); for (i = 0; i < npages; ++i) { if (i != reqpage) vm_page_free(pages[i]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return 0; } VM_OBJECT_UNLOCK(object); smb_makescred(&scred, td, cred); bp = getpbuf(&smbfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, npages); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = smb_read(smp->sm_share, np->n_fid, &uio, &scred); pmap_qremove(kva, npages); relpbuf(bp, &smbfs_pbuf_freecnt); VM_OBJECT_LOCK(object); if (error && (uio.uio_resid == count)) { printf("smbfs_getpages: error %d\n",error); vm_page_lock_queues(); for (i = 0; i < npages; i++) { if (reqpage != i) vm_page_free(pages[i]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return VM_PAGER_ERROR; } size = count - uio.uio_resid; vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_validclean(m, 0, size - toff); /* handled by vm_fault now */ /* vm_page_zero_invalid(m, TRUE); */ } else { /* * Read operation was short. If no error occured * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } if (i != reqpage) { /* * Whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere (it already is in the object). Result: * It appears that emperical results show that * deactivating pages is best. */ /* * Just in case someone was asking for this page we * now tell them that it is ok to use. */ if (!error) { if (m->oflags & VPO_WANTED) vm_page_activate(m); else vm_page_deactivate(m); vm_page_wakeup(m); } else { vm_page_free(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return 0; #endif /* SMBFS_RWGENERIC */ } /* * Vnode op for VM putpages. * possible bug: all IO done in sync mode * Note that vop_close always invalidate pages before close, so it's * not necessary to open vnode. */ int smbfs_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; } */ *ap; { int error; struct vnode *vp = ap->a_vp; struct thread *td; struct ucred *cred; #ifdef SMBFS_RWGENERIC td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ VOP_OPEN(vp, FWRITE, cred, td, NULL); error = vop_stdputpages(ap); VOP_CLOSE(vp, FWRITE, cred, td); return error; #else struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, npages, count; int *rtvals; struct smbmount *smp; struct smbnode *np; struct smb_cred scred; vm_page_t *pages; td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ /* VOP_OPEN(vp, FWRITE, cred, td, NULL);*/ np = VTOSMB(vp); smp = VFSTOSMBFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); for (i = 0; i < npages; i++) { rtvals[i] = VM_PAGER_AGAIN; } bp = getpbuf(&smbfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, count); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; SMBVDEBUG("ofs=%d,resid=%d\n",(int)uio.uio_offset, uio.uio_resid); smb_makescred(&scred, td, cred); error = smb_write(smp->sm_share, np->n_fid, &uio, &scred); /* VOP_CLOSE(vp, FWRITE, cred, td);*/ SMBVDEBUG("paged write done: %d\n", error); pmap_qremove(kva, npages); relpbuf(bp, &smbfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; vm_page_lock_queues(); for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(pages[i]); } vm_page_unlock_queues(); } return rtvals[0]; #endif /* SMBFS_RWGENERIC */ } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int smbfs_vinvalbuf(struct vnode *vp, struct thread *td) { struct smbnode *np = VTOSMB(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; error = tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz); error = smb_td_intr(td); if (error == EINTR) return EINTR; } np->n_flag |= NFLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_LOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, td, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup(&np->n_flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, td, PCATCH, 0); } np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup(&np->n_flag); } return (error); } Index: head/sys/fs/smbfs/smbfs_node.c =================================================================== --- head/sys/fs/smbfs/smbfs_node.c (revision 175201) +++ head/sys/fs/smbfs/smbfs_node.c (revision 175202) @@ -1,447 +1,447 @@ /*- * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#include #include */ #include #include #include #define SMBFS_NOHASH(smp, hval) (&(smp)->sm_hash[(hval) & (smp)->sm_hashlen]) #define smbfs_hash_lock(smp, td) lockmgr(&smp->sm_hashlock, LK_EXCLUSIVE, NULL, td) #define smbfs_hash_unlock(smp, td) lockmgr(&smp->sm_hashlock, LK_RELEASE, NULL, td) extern struct vop_vector smbfs_vnodeops; /* XXX -> .h file */ MALLOC_DEFINE(M_SMBNODE, "smbufs_node", "SMBFS vnode private part"); static MALLOC_DEFINE(M_SMBNODENAME, "smbufs_nname", "SMBFS node name"); int smbfs_hashprint(struct mount *mp); #if 0 #ifdef SYSCTL_DECL SYSCTL_DECL(_vfs_smbfs); #endif SYSCTL_PROC(_vfs_smbfs, OID_AUTO, vnprint, CTLFLAG_WR|CTLTYPE_OPAQUE, NULL, 0, smbfs_hashprint, "S,vnlist", "vnode hash"); #endif #define FNV_32_PRIME ((u_int32_t) 0x01000193UL) #define FNV1_32_INIT ((u_int32_t) 33554467UL) u_int32_t smbfs_hash(const u_char *name, int nmlen) { u_int32_t v; for (v = FNV1_32_INIT; nmlen; name++, nmlen--) { v *= FNV_32_PRIME; v ^= (u_int32_t)*name; } return v; } int smbfs_hashprint(struct mount *mp) { struct smbmount *smp = VFSTOSMBFS(mp); struct smbnode_hashhead *nhpp; struct smbnode *np; int i; for(i = 0; i <= smp->sm_hashlen; i++) { nhpp = &smp->sm_hash[i]; LIST_FOREACH(np, nhpp, n_hash) vprint("", SMBTOV(np)); } return 0; } static char * smbfs_name_alloc(const u_char *name, int nmlen) { u_char *cp; nmlen++; #ifdef SMBFS_NAME_DEBUG cp = malloc(nmlen + 2 + sizeof(int), M_SMBNODENAME, M_WAITOK); *(int*)cp = nmlen; cp += sizeof(int); cp[0] = 0xfc; cp++; bcopy(name, cp, nmlen - 1); cp[nmlen] = 0xfe; #else cp = malloc(nmlen, M_SMBNODENAME, M_WAITOK); bcopy(name, cp, nmlen - 1); #endif cp[nmlen - 1] = 0; return cp; } static void smbfs_name_free(u_char *name) { #ifdef SMBFS_NAME_DEBUG int nmlen, slen; u_char *cp; cp = name; cp--; if (*cp != 0xfc) panic("First byte of name entry '%s' corrupted", name); cp -= sizeof(int); nmlen = *(int*)cp; slen = strlen(name) + 1; if (nmlen != slen) panic("Name length mismatch: was %d, now %d name '%s'", nmlen, slen, name); if (name[nmlen] != 0xfe) panic("Last byte of name entry '%s' corrupted\n", name); free(cp, M_SMBNODENAME); #else free(name, M_SMBNODENAME); #endif } static int smbfs_node_alloc(struct mount *mp, struct vnode *dvp, const char *name, int nmlen, struct smbfattr *fap, struct vnode **vpp) { struct vattr vattr; struct thread *td = curthread; /* XXX */ struct smbmount *smp = VFSTOSMBFS(mp); struct smbnode_hashhead *nhpp; struct smbnode *np, *np2, *dnp; struct vnode *vp; u_long hashval; int error; *vpp = NULL; if (smp->sm_root != NULL && dvp == NULL) { SMBERROR("do not allocate root vnode twice!\n"); return EINVAL; } if (nmlen == 2 && bcmp(name, "..", 2) == 0) { if (dvp == NULL) return EINVAL; vp = VTOSMB(VTOSMB(dvp)->n_parent)->n_vnode; error = vget(vp, LK_EXCLUSIVE, td); if (error == 0) *vpp = vp; return error; } else if (nmlen == 1 && name[0] == '.') { SMBERROR("do not call me with dot!\n"); return EINVAL; } dnp = dvp ? VTOSMB(dvp) : NULL; if (dnp == NULL && dvp != NULL) { vprint("smbfs_node_alloc: dead parent vnode", dvp); return EINVAL; } hashval = smbfs_hash(name, nmlen); retry: smbfs_hash_lock(smp, td); loop: nhpp = SMBFS_NOHASH(smp, hashval); LIST_FOREACH(np, nhpp, n_hash) { vp = SMBTOV(np); if (np->n_parent != dvp || np->n_nmlen != nmlen || bcmp(name, np->n_name, nmlen) != 0) continue; VI_LOCK(vp); smbfs_hash_unlock(smp, td); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) goto retry; /* Force cached attributes to be refreshed if stale. */ (void)VOP_GETATTR(vp, &vattr, td->td_ucred, td); /* * If the file type on the server is inconsistent with * what it was when we created the vnode, kill the * bogus vnode now and fall through to the code below * to create a new one with the right type. */ if ((vp->v_type == VDIR && (np->n_dosattr & SMB_FA_DIR) == 0) || (vp->v_type == VREG && (np->n_dosattr & SMB_FA_DIR) != 0)) { vgone(vp); vput(vp); break; } *vpp = vp; return 0; } smbfs_hash_unlock(smp, td); /* * If we don't have node attributes, then it is an explicit lookup * for an existing vnode. */ if (fap == NULL) return ENOENT; MALLOC(np, struct smbnode *, sizeof *np, M_SMBNODE, M_WAITOK); error = getnewvnode("smbfs", mp, &smbfs_vnodeops, &vp); if (error) { FREE(np, M_SMBNODE); return error; } error = insmntque(vp, mp); /* XXX: Too early for mpsafe fs */ if (error != 0) { FREE(np, M_SMBNODE); return (error); } vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG; bzero(np, sizeof(*np)); vp->v_data = np; np->n_vnode = vp; np->n_mount = VFSTOSMBFS(mp); np->n_nmlen = nmlen; np->n_name = smbfs_name_alloc(name, nmlen); np->n_ino = fap->fa_ino; if (dvp) { ASSERT_VOP_LOCKED(dvp, "smbfs_node_alloc"); np->n_parent = dvp; if (/*vp->v_type == VDIR &&*/ (dvp->v_vflag & VV_ROOT) == 0) { vref(dvp); np->n_flag |= NREFPARENT; } } else if (vp->v_type == VREG) SMBERROR("new vnode '%s' born without parent ?\n", np->n_name); vp->v_vnlock->lk_flags |= LK_CANRECURSE; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); smbfs_hash_lock(smp, td); LIST_FOREACH(np2, nhpp, n_hash) { if (np2->n_parent != dvp || np2->n_nmlen != nmlen || bcmp(name, np2->n_name, nmlen) != 0) continue; vput(vp); /* smb_name_free(np->n_name); FREE(np, M_SMBNODE);*/ goto loop; } LIST_INSERT_HEAD(nhpp, np, n_hash); smbfs_hash_unlock(smp, td); *vpp = vp; return 0; } int smbfs_nget(struct mount *mp, struct vnode *dvp, const char *name, int nmlen, struct smbfattr *fap, struct vnode **vpp) { struct smbnode *np; struct vnode *vp; int error; *vpp = NULL; error = smbfs_node_alloc(mp, dvp, name, nmlen, fap, &vp); if (error) return error; np = VTOSMB(vp); if (fap) smbfs_attr_cacheenter(vp, fap); *vpp = vp; return 0; } /* * Free smbnode, and give vnode back to system */ int smbfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct vnode *dvp; struct smbnode *np = VTOSMB(vp); struct smbmount *smp = VTOSMBFS(vp); SMBVDEBUG("%s,%d\n", np->n_name, vrefcnt(vp)); KASSERT((np->n_flag & NOPEN) == 0, ("file not closed before reclaim")); smbfs_hash_lock(smp, td); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); dvp = (np->n_parent && (np->n_flag & NREFPARENT)) ? np->n_parent : NULL; if (np->n_hash.le_prev) LIST_REMOVE(np, n_hash); if (smp->sm_root == np) { SMBVDEBUG("root vnode\n"); smp->sm_root = NULL; } vp->v_data = NULL; smbfs_hash_unlock(smp, td); if (np->n_name) smbfs_name_free(np->n_name); FREE(np, M_SMBNODE); if (dvp != NULL) { vrele(dvp); /* * Indicate that we released something; see comment * in smbfs_unmount(). */ smp->sm_didrele = 1; } return 0; } int smbfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct thread *td = ap->a_td; struct ucred *cred = td->td_ucred; struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct smb_cred scred; struct vattr va; SMBVDEBUG("%s: %d\n", VTOSMB(vp)->n_name, vrefcnt(vp)); if ((np->n_flag & NOPEN) != 0) { smb_makescred(&scred, td, cred); smbfs_vinvalbuf(vp, td); if (vp->v_type == VREG) { VOP_GETATTR(vp, &va, cred, td); smbfs_smb_close(np->n_mount->sm_share, np->n_fid, &np->n_mtime, &scred); } else if (vp->v_type == VDIR) { if (np->n_dirseq != NULL) { smbfs_findclose(np->n_dirseq, &scred); np->n_dirseq = NULL; } } np->n_flag &= ~NOPEN; smbfs_attr_cacheremove(vp); } if (np->n_flag & NGONE) vrecycle(vp, td); return (0); } /* * routines to maintain vnode attributes cache * smbfs_attr_cacheenter: unpack np.i to vattr structure */ void smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap) { struct smbnode *np = VTOSMB(vp); if (vp->v_type == VREG) { if (np->n_size != fap->fa_size) { np->n_size = fap->fa_size; vnode_pager_setsize(vp, np->n_size); } } else if (vp->v_type == VDIR) { np->n_size = 16384; /* should be a better way ... */ } else return; np->n_mtime = fap->fa_mtime; np->n_dosattr = fap->fa_attr; np->n_attrage = time_second; return; } int smbfs_attr_cachelookup(struct vnode *vp, struct vattr *va) { struct smbnode *np = VTOSMB(vp); struct smbmount *smp = VTOSMBFS(vp); int diff; diff = time_second - np->n_attrage; if (diff > 2) /* XXX should be configurable */ return ENOENT; va->va_type = vp->v_type; /* vnode type (for create) */ if (vp->v_type == VREG) { va->va_mode = smp->sm_file_mode; /* files access mode and type */ if (np->n_dosattr & SMB_FA_RDONLY) va->va_mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH); } else if (vp->v_type == VDIR) { va->va_mode = smp->sm_dir_mode; /* files access mode and type */ } else return EINVAL; va->va_size = np->n_size; va->va_nlink = 1; /* number of references to file */ va->va_uid = smp->sm_uid; /* owner user id */ va->va_gid = smp->sm_gid; /* owner group id */ va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; va->va_fileid = np->n_ino; /* file id */ if (va->va_fileid == 0) va->va_fileid = 2; va->va_blocksize = SSTOVC(smp->sm_share)->vc_txmax; va->va_mtime = np->n_mtime; va->va_atime = va->va_ctime = va->va_mtime; /* time file changed */ va->va_gen = VNOVAL; /* generation number of file */ va->va_flags = 0; /* flags defined for file */ va->va_rdev = VNOVAL; /* device the special file represents */ va->va_bytes = va->va_size; /* bytes of disk space held by file */ va->va_filerev = 0; /* file modification number */ va->va_vaflags = 0; /* operations flags */ return 0; } Index: head/sys/fs/smbfs/smbfs_vnops.c =================================================================== --- head/sys/fs/smbfs/smbfs_vnops.c (revision 175201) +++ head/sys/fs/smbfs/smbfs_vnops.c (revision 175202) @@ -1,1271 +1,1271 @@ /*- * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for SMBFS vnode operations */ static vop_create_t smbfs_create; static vop_mknod_t smbfs_mknod; static vop_open_t smbfs_open; static vop_close_t smbfs_close; static vop_access_t smbfs_access; static vop_getattr_t smbfs_getattr; static vop_setattr_t smbfs_setattr; static vop_read_t smbfs_read; static vop_write_t smbfs_write; static vop_fsync_t smbfs_fsync; static vop_remove_t smbfs_remove; static vop_link_t smbfs_link; static vop_lookup_t smbfs_lookup; static vop_rename_t smbfs_rename; static vop_mkdir_t smbfs_mkdir; static vop_rmdir_t smbfs_rmdir; static vop_symlink_t smbfs_symlink; static vop_readdir_t smbfs_readdir; static vop_strategy_t smbfs_strategy; static vop_print_t smbfs_print; static vop_pathconf_t smbfs_pathconf; static vop_advlock_t smbfs_advlock; static vop_getextattr_t smbfs_getextattr; struct vop_vector smbfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = smbfs_access, .vop_advlock = smbfs_advlock, .vop_close = smbfs_close, .vop_create = smbfs_create, .vop_fsync = smbfs_fsync, .vop_getattr = smbfs_getattr, .vop_getextattr = smbfs_getextattr, .vop_getpages = smbfs_getpages, .vop_inactive = smbfs_inactive, .vop_ioctl = smbfs_ioctl, .vop_link = smbfs_link, .vop_lookup = smbfs_lookup, .vop_mkdir = smbfs_mkdir, .vop_mknod = smbfs_mknod, .vop_open = smbfs_open, .vop_pathconf = smbfs_pathconf, .vop_print = smbfs_print, .vop_putpages = smbfs_putpages, .vop_read = smbfs_read, .vop_readdir = smbfs_readdir, .vop_reclaim = smbfs_reclaim, .vop_remove = smbfs_remove, .vop_rename = smbfs_rename, .vop_rmdir = smbfs_rmdir, .vop_setattr = smbfs_setattr, /* .vop_setextattr = smbfs_setextattr,*/ .vop_strategy = smbfs_strategy, .vop_symlink = smbfs_symlink, .vop_write = smbfs_write, }; static int smbfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; mode_t mpmode; struct smbmount *smp = VTOSMBFS(vp); SMBVDEBUG("\n"); if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return EROFS; default: break; } } mpmode = vp->v_type == VREG ? smp->sm_file_mode : smp->sm_dir_mode; return (vaccess(vp->v_type, mpmode, smp->sm_uid, smp->sm_gid, ap->a_mode, ap->a_cred, NULL)); } /* ARGSUSED */ static int smbfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct smb_cred scred; struct vattr vattr; int mode = ap->a_mode; int error, accmode; SMBVDEBUG("%s,%d\n", np->n_name, (np->n_flag & NOPEN) != 0); if (vp->v_type != VREG && vp->v_type != VDIR) { SMBFSERR("open eacces vtype=%d\n", vp->v_type); return EACCES; } if (vp->v_type == VDIR) { np->n_flag |= NOPEN; return 0; } if (np->n_flag & NMODIFIED) { if ((error = smbfs_vinvalbuf(vp, ap->a_td)) == EINTR) return error; smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return error; if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) { error = smbfs_vinvalbuf(vp, ap->a_td); if (error == EINTR) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } } if ((np->n_flag & NOPEN) != 0) return 0; /* * Use DENYNONE to give unixy semantics of permitting * everything not forbidden by permissions. Ie denial * is up to server with clients/openers needing to use * advisory locks for further control. */ accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) accmode = SMB_SM_DENYNONE|SMB_AM_OPENRW; smb_makescred(&scred, ap->a_td, ap->a_cred); error = smbfs_smb_open(np, accmode, &scred); if (error) { if (mode & FWRITE) return EACCES; else if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD; error = smbfs_smb_open(np, accmode, &scred); } } if (error == 0) { np->n_flag |= NOPEN; vnode_create_vobject(ap->a_vp, vattr.va_size, ap->a_td); } smbfs_attr_cacheremove(vp); return error; } static int smbfs_close(ap) struct vop_close_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct smbnode *np = VTOSMB(vp); struct smb_cred scred; if (vp->v_type == VDIR && (np->n_flag & NOPEN) != 0 && np->n_dirseq != NULL) { smb_makescred(&scred, td, ap->a_cred); smbfs_findclose(np->n_dirseq, &scred); np->n_dirseq = NULL; } return 0; } /* * smbfs_getattr call from vfs. */ static int smbfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct vattr *va=ap->a_vap; struct smbfattr fattr; struct smb_cred scred; u_quad_t oldsize; int error; SMBVDEBUG("%lx: '%s' %d\n", (long)vp, np->n_name, (vp->v_vflag & VV_ROOT) != 0); error = smbfs_attr_cachelookup(vp, va); if (!error) return 0; SMBVDEBUG("not in the cache\n"); smb_makescred(&scred, ap->a_td, ap->a_cred); oldsize = np->n_size; error = smbfs_smb_lookup(np, NULL, 0, &fattr, &scred); if (error) { SMBVDEBUG("error %d\n", error); return error; } smbfs_attr_cacheenter(vp, &fattr); smbfs_attr_cachelookup(vp, va); if (np->n_flag & NOPEN) np->n_size = oldsize; return 0; } static int smbfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct vattr *vap = ap->a_vap; struct timespec *mtime, *atime; struct smb_cred scred; struct smb_share *ssp = np->n_mount->sm_share; struct smb_vc *vcp = SSTOVC(ssp); u_quad_t tsize = 0; int isreadonly, doclose, error = 0; int old_n_dosattr; SMBVDEBUG("\n"); if (vap->va_flags != VNOVAL) return EOPNOTSUPP; isreadonly = (vp->v_mount->mnt_flag & MNT_RDONLY); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && isreadonly) return EROFS; smb_makescred(&scred, ap->a_td, ap->a_cred); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return EISDIR; case VREG: break; default: return EINVAL; }; if (isreadonly) return EROFS; doclose = 0; vnode_pager_setsize(vp, (u_long)vap->va_size); tsize = np->n_size; np->n_size = vap->va_size; if ((np->n_flag & NOPEN) == 0) { error = smbfs_smb_open(np, SMB_SM_DENYNONE|SMB_AM_OPENRW, &scred); if (error == 0) doclose = 1; } if (error == 0) error = smbfs_smb_setfsize(np, vap->va_size, &scred); if (doclose) smbfs_smb_close(ssp, np->n_fid, NULL, &scred); if (error) { np->n_size = tsize; vnode_pager_setsize(vp, (u_long)tsize); return error; } } if (vap->va_mode != (mode_t)VNOVAL) { old_n_dosattr = np->n_dosattr; if (vap->va_mode & S_IWUSR) np->n_dosattr &= ~SMB_FA_RDONLY; else np->n_dosattr |= SMB_FA_RDONLY; if (np->n_dosattr != old_n_dosattr) { error = smbfs_smb_setpattr(np, np->n_dosattr, NULL, &scred); if (error) return error; } } mtime = atime = NULL; if (vap->va_mtime.tv_sec != VNOVAL) mtime = &vap->va_mtime; if (vap->va_atime.tv_sec != VNOVAL) atime = &vap->va_atime; if (mtime != atime) { if (vap->va_vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td); if (error) error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td); } else error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td); #if 0 if (mtime == NULL) mtime = &np->n_mtime; if (atime == NULL) atime = &np->n_atime; #endif /* * If file is opened, then we can use handle based calls. * If not, use path based ones. */ if ((np->n_flag & NOPEN) == 0) { if (vcp->vc_flags & SMBV_WIN95) { error = VOP_OPEN(vp, FWRITE, ap->a_cred, ap->a_td, NULL); if (!error) { /* error = smbfs_smb_setfattrNT(np, 0, mtime, atime, &scred); VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);*/ if (mtime) np->n_mtime = *mtime; VOP_CLOSE(vp, FWRITE, ap->a_cred, ap->a_td); } } else if ((vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS)) { error = smbfs_smb_setptime2(np, mtime, atime, 0, &scred); /* error = smbfs_smb_setpattrNT(np, 0, mtime, atime, &scred);*/ } else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN2_0) { error = smbfs_smb_setptime2(np, mtime, atime, 0, &scred); } else { error = smbfs_smb_setpattr(np, 0, mtime, &scred); } } else { if (vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS) { error = smbfs_smb_setfattrNT(np, 0, mtime, atime, &scred); } else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN1_0) { error = smbfs_smb_setftime(np, mtime, atime, &scred); } else { /* * I have no idea how to handle this for core * level servers. The possible solution is to * update mtime after file is closed. */ SMBERROR("can't update times on an opened file\n"); } } } /* * Invalidate attribute cache in case if server doesn't set * required attributes. */ smbfs_attr_cacheremove(vp); /* invalidate cache */ VOP_GETATTR(vp, vap, ap->a_cred, ap->a_td); np->n_mtime.tv_sec = vap->va_mtime.tv_sec; return error; } /* * smbfs_read call. */ static int smbfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; SMBVDEBUG("\n"); if (vp->v_type != VREG && vp->v_type != VDIR) return EPERM; return smbfs_readvnode(vp, uio, ap->a_cred); } static int smbfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; SMBVDEBUG("%d,ofs=%d,sz=%d\n",vp->v_type, (int)uio->uio_offset, uio->uio_resid); if (vp->v_type != VREG) return (EPERM); return smbfs_writevnode(vp, uio, ap->a_cred,ap->a_ioflag); } /* * smbfs_create call * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int smbfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct vnode **vpp=ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct smbnode *dnp = VTOSMB(dvp); struct vnode *vp; struct vattr vattr; struct smbfattr fattr; struct smb_cred scred; char *name = cnp->cn_nameptr; int nmlen = cnp->cn_namelen; int error; SMBVDEBUG("\n"); *vpp = NULL; if (vap->va_type != VREG) return EOPNOTSUPP; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) return error; smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred); error = smbfs_smb_create(dnp, name, nmlen, &scred); if (error) return error; error = smbfs_smb_lookup(dnp, name, nmlen, &fattr, &scred); if (error) return error; error = smbfs_nget(VTOVFS(dvp), dvp, name, nmlen, &fattr, &vp); if (error) return error; *vpp = vp; if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, vp, cnp); return error; } static int smbfs_remove(ap) struct vop_remove_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode * a_vp; struct componentname * a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; /* struct vnode *dvp = ap->a_dvp;*/ struct componentname *cnp = ap->a_cnp; struct smbnode *np = VTOSMB(vp); struct smb_cred scred; int error; if (vp->v_type == VDIR || (np->n_flag & NOPEN) != 0 || vrefcnt(vp) != 1) return EPERM; smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred); error = smbfs_smb_delete(np, &scred); if (error == 0) np->n_flag |= NGONE; cache_purge(vp); return error; } /* * smbfs_file rename call */ static int smbfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; /* struct componentname *fcnp = ap->a_fcnp;*/ struct smb_cred scred; u_int16_t flags = 6; int error=0; /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (tvp && vrefcnt(tvp) > 1) { error = EBUSY; goto out; } flags = 0x10; /* verify all writes */ if (fvp->v_type == VDIR) { flags |= 2; } else if (fvp->v_type == VREG) { flags |= 1; } else { error = EINVAL; goto out; } smb_makescred(&scred, tcnp->cn_thread, tcnp->cn_cred); /* * It seems that Samba doesn't implement SMB_COM_MOVE call... */ #ifdef notnow if (SMB_DIALECT(SSTOCN(smp->sm_share)) >= SMB_DIALECT_LANMAN1_0) { error = smbfs_smb_move(VTOSMB(fvp), VTOSMB(tdvp), tcnp->cn_nameptr, tcnp->cn_namelen, flags, &scred); } else #endif { /* * We have to do the work atomicaly */ if (tvp && tvp != fvp) { error = smbfs_smb_delete(VTOSMB(tvp), &scred); if (error) goto out_cacherem; VTOSMB(fvp)->n_flag |= NGONE; } error = smbfs_smb_rename(VTOSMB(fvp), VTOSMB(tdvp), tcnp->cn_nameptr, tcnp->cn_namelen, &scred); } if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out_cacherem: smbfs_attr_cacheremove(fdvp); smbfs_attr_cacheremove(tdvp); out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); #ifdef possible_mistake vgone(fvp); if (tvp) vgone(tvp); #endif return error; } /* * somtime it will come true... */ static int smbfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { return EOPNOTSUPP; } /* * smbfs_symlink link create call. * Sometime it will be functional... */ static int smbfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { return EOPNOTSUPP; } static int smbfs_mknod(ap) struct vop_mknod_args /* { } */ *ap; { return EOPNOTSUPP; } static int smbfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; /* struct vattr *vap = ap->a_vap;*/ struct vnode *vp; struct componentname *cnp = ap->a_cnp; struct smbnode *dnp = VTOSMB(dvp); struct vattr vattr; struct smb_cred scred; struct smbfattr fattr; char *name = cnp->cn_nameptr; int len = cnp->cn_namelen; int error; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) { return error; } if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.')))) return EEXIST; smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred); error = smbfs_smb_mkdir(dnp, name, len, &scred); if (error) return error; error = smbfs_smb_lookup(dnp, name, len, &fattr, &scred); if (error) return error; error = smbfs_nget(VTOVFS(dvp), dvp, name, len, &fattr, &vp); if (error) return error; *ap->a_vpp = vp; return 0; } /* * smbfs_remove directory call */ static int smbfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; /* struct smbmount *smp = VTOSMBFS(vp);*/ struct smbnode *dnp = VTOSMB(dvp); struct smbnode *np = VTOSMB(vp); struct smb_cred scred; int error; if (dvp == vp) return EINVAL; smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred); error = smbfs_smb_rmdir(np, &scred); if (error == 0) np->n_flag |= NGONE; dnp->n_flag |= NMODIFIED; smbfs_attr_cacheremove(dvp); /* cache_purge(dvp);*/ cache_purge(vp); return error; } /* * smbfs_readdir call */ static int smbfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; if (vp->v_type != VDIR) return (EPERM); #ifdef notnow if (ap->a_ncookies) { printf("smbfs_readdir: no support for cookies now..."); return (EOPNOTSUPP); } #endif error = smbfs_readvnode(vp, uio, ap->a_cred); return error; } /* ARGSUSED */ static int smbfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread * a_td; } */ *ap; { /* return (smb_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_td, 1));*/ return (0); } static int smbfs_print (ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); if (np == NULL) { printf("no smbnode data\n"); return (0); } printf("\tname = %s, parent = %p, open = %d\n", np->n_name, np->n_parent ? np->n_parent : NULL, (np->n_flag & NOPEN) != 0); return (0); } static int smbfs_pathconf (ap) struct vop_pathconf_args /* { struct vnode *vp; int name; register_t *retval; } */ *ap; { struct smbmount *smp = VFSTOSMBFS(VTOVFS(ap->a_vp)); struct smb_vc *vcp = SSTOVC(smp->sm_share); register_t *retval = ap->a_retval; int error = 0; switch (ap->a_name) { case _PC_LINK_MAX: *retval = 0; break; case _PC_NAME_MAX: *retval = (vcp->vc_hflags2 & SMB_FLAGS2_KNOWS_LONG_NAMES) ? 255 : 12; break; case _PC_PATH_MAX: *retval = 800; /* XXX: a correct one ? */ break; default: error = EINVAL; } return error; } static int smbfs_strategy (ap) struct vop_strategy_args /* { struct buf *a_bp } */ *ap; { struct buf *bp=ap->a_bp; struct ucred *cr; struct thread *td; int error = 0; SMBVDEBUG("\n"); if (bp->b_flags & B_ASYNC) td = (struct thread *)0; else td = curthread; /* XXX */ if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; if ((bp->b_flags & B_ASYNC) == 0 ) error = smbfs_doio(ap->a_vp, bp, cr, td); return error; } int smbfs_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int fflag; struct ucred *cred; struct thread *td; } */ *ap; { return ENOTTY; } static char smbfs_atl[] = "rhsvda"; static int smbfs_getextattr(struct vop_getextattr_args *ap) /* { IN struct vnode *a_vp; IN char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; struct uio *uio = ap->a_uio; const char *name = ap->a_name; struct smbnode *np = VTOSMB(vp); struct vattr vattr; char buf[10]; int i, attr, error; error = VOP_ACCESS(vp, VREAD, cred, td); if (error) return error; error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return error; if (strcmp(name, "dosattr") == 0) { attr = np->n_dosattr; for (i = 0; i < 6; i++, attr >>= 1) buf[i] = (attr & 1) ? smbfs_atl[i] : '-'; buf[i] = 0; error = uiomove(buf, i, uio); } else error = EINVAL; return error; } /* * Since we expected to support F_GETLK (and SMB protocol has no such function), * it is necessary to use lf_advlock(). It would be nice if this function had * a callback mechanism because it will help to improve a level of consistency. */ int smbfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct flock *fl = ap->a_fl; caddr_t id = (caddr_t)1 /* ap->a_id */; /* int flags = ap->a_flags;*/ struct thread *td = curthread; struct smb_cred scred; u_quad_t size; off_t start, end, oadd; int error, lkop; if (vp->v_type == VDIR) { /* * SMB protocol have no support for directory locking. * Although locks can be processed on local machine, I don't * think that this is a good idea, because some programs * can work wrong assuming directory is locked. So, we just * return 'operation not supported */ return EOPNOTSUPP; } size = np->n_size; switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: start = fl->l_start; break; case SEEK_END: if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) return EOVERFLOW; start = size + fl->l_start; break; default: return EINVAL; } if (start < 0) return EINVAL; if (fl->l_len < 0) { if (start == 0) return EINVAL; end = start - 1; start += fl->l_len; if (start < 0) return EINVAL; } else if (fl->l_len == 0) end = -1; else { oadd = fl->l_len - 1; if (oadd > OFF_MAX - start) return EOVERFLOW; end = start + oadd; } smb_makescred(&scred, td, td->td_ucred); switch (ap->a_op) { case F_SETLK: switch (fl->l_type) { case F_WRLCK: lkop = SMB_LOCK_EXCL; break; case F_RDLCK: lkop = SMB_LOCK_SHARED; break; case F_UNLCK: lkop = SMB_LOCK_RELEASE; break; default: return EINVAL; } error = lf_advlock(ap, &np->n_lockf, size); if (error) break; lkop = SMB_LOCK_EXCL; error = smbfs_smb_lock(np, lkop, id, start, end, &scred); if (error) { ap->a_op = F_UNLCK; lf_advlock(ap, &np->n_lockf, size); } break; case F_UNLCK: lf_advlock(ap, &np->n_lockf, size); error = smbfs_smb_lock(np, SMB_LOCK_RELEASE, id, start, end, &scred); break; case F_GETLK: error = lf_advlock(ap, &np->n_lockf, size); break; default: return EINVAL; } return error; } static int smbfs_pathcheck(struct smbmount *smp, const char *name, int nmlen, int nameiop) { static const char *badchars = "*/:<>;?"; static const char *badchars83 = " +|,[]="; const char *cp; int i, error; /* * Backslash characters, being a path delimiter, are prohibited * within a path component even for LOOKUP operations. */ if (index(name, '\\') != NULL) return ENOENT; if (nameiop == LOOKUP) return 0; error = ENOENT; if (SMB_DIALECT(SSTOVC(smp->sm_share)) < SMB_DIALECT_LANMAN2_0) { /* * Name should conform 8.3 format */ if (nmlen > 12) return ENAMETOOLONG; cp = index(name, '.'); if (cp == NULL) return error; if (cp == name || (cp - name) > 8) return error; cp = index(cp + 1, '.'); if (cp != NULL) return error; for (cp = name, i = 0; i < nmlen; i++, cp++) if (index(badchars83, *cp) != NULL) return error; } for (cp = name, i = 0; i < nmlen; i++, cp++) if (index(badchars, *cp) != NULL) return error; return 0; } /* * Things go even weird without fixed inode numbers... */ int smbfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct vnode *vp; struct smbmount *smp; struct mount *mp = dvp->v_mount; struct smbnode *dnp; struct smbfattr fattr, *fap; struct smb_cred scred; char *name = cnp->cn_nameptr; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; int nmlen = cnp->cn_namelen; int error, islastcn, isdot; int killit; SMBVDEBUG("\n"); if (dvp->v_type != VDIR) return ENOTDIR; if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) { SMBFSERR("invalid '..'\n"); return EIO; } #ifdef SMB_VNODE_DEBUG { char *cp, c; cp = name + nmlen; c = *cp; *cp = 0; SMBVDEBUG("%d '%s' in '%s' id=d\n", nameiop, name, VTOSMB(dvp)->n_name); *cp = c; } #endif islastcn = flags & ISLASTCN; if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP)) return EROFS; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) return error; smp = VFSTOSMBFS(mp); dnp = VTOSMB(dvp); isdot = (nmlen == 1 && name[0] == '.'); error = smbfs_pathcheck(smp, cnp->cn_nameptr, cnp->cn_namelen, nameiop); if (error) return ENOENT; error = cache_lookup(dvp, vpp, cnp); SMBVDEBUG("cache_lookup returned %d\n", error); if (error > 0) return error; if (error) { /* name was found */ struct vattr vattr; killit = 0; vp = *vpp; error = VOP_GETATTR(vp, &vattr, cnp->cn_cred, td); /* * If the file type on the server is inconsistent * with what it was when we created the vnode, * kill the bogus vnode now and fall through to * the code below to create a new one with the * right type. */ if (error == 0 && ((vp->v_type == VDIR && (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) == 0) || (vp->v_type == VREG && (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) != 0))) killit = 1; else if (error == 0 /* && vattr.va_ctime.tv_sec == VTOSMB(vp)->n_ctime*/) { if (nameiop != LOOKUP && islastcn) cnp->cn_flags |= SAVENAME; SMBVDEBUG("use cached vnode\n"); return (0); } cache_purge(vp); /* * XXX This is not quite right, if '.' is * inconsistent, we really need to start the lookup * all over again. Hopefully there is some other * guarantee that prevents this case from happening. */ if (killit && vp != dvp) vgone(vp); if (vp != dvp) vput(vp); else vrele(vp); *vpp = NULLVP; } /* * entry is not in the cache or has been expired */ error = 0; *vpp = NULLVP; smb_makescred(&scred, td, cnp->cn_cred); fap = &fattr; if (flags & ISDOTDOT) { error = smbfs_smb_lookup(VTOSMB(dnp->n_parent), NULL, 0, fap, &scred); SMBVDEBUG("result of dotdot lookup: %d\n", error); } else { fap = &fattr; error = smbfs_smb_lookup(dnp, name, nmlen, fap, &scred); /* if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')*/ SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error); } if (error && error != ENOENT) return error; if (error) { /* entry not found */ /* * Handle RENAME or CREATE case... */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return error; cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return ENOENT; }/* else { SMBVDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum); }*/ /* * handle DELETE case ... */ if (nameiop == DELETE && islastcn) { /* delete last component */ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return error; if (isdot) { VREF(dvp); *vpp = dvp; return 0; } error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) return error; *vpp = vp; cnp->cn_flags |= SAVENAME; return 0; } if (nameiop == RENAME && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return error; if (isdot) return EISDIR; error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) return error; *vpp = vp; cnp->cn_flags |= SAVENAME; return 0; } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); error = smbfs_nget(mp, dvp, name, nmlen, NULL, &vp); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (error) return error; *vpp = vp; } else if (isdot) { vref(dvp); *vpp = dvp; } else { error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) return error; *vpp = vp; SMBVDEBUG("lookup: getnewvp!\n"); } if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) { /* VTOSMB(*vpp)->n_ctime = VTOSMB(*vpp)->n_vattr.va_ctime.tv_sec;*/ cache_enter(dvp, *vpp, cnp); } return 0; } Index: head/sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- head/sys/fs/tmpfs/tmpfs_subr.c (revision 175201) +++ head/sys/fs/tmpfs/tmpfs_subr.c (revision 175202) @@ -1,1301 +1,1301 @@ /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ /* * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system supporting functions. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* --------------------------------------------------------------------- */ /* * Allocates a new node of type 'type' inside the 'tmp' mount point, with * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', * using the credentials of the process 'p'. * * If the node type is set to 'VDIR', then the parent parameter must point * to the parent directory of the node being created. It may only be NULL * while allocating the root node. * * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter * specifies the device the node represents. * * If the node type is set to 'VLNK', then the parameter target specifies * the file name of the target file for the symbolic link that is being * created. * * Note that new nodes are retrieved from the available list if it has * items or, if it is empty, from the node pool as long as there is enough * space to create them. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type, uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, char *target, dev_t rdev, struct thread *p, struct tmpfs_node **node) { struct tmpfs_node *nnode; /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. */ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); if (tmp->tm_nodes_inuse > tmp->tm_nodes_max) return (ENOSPC); nnode = (struct tmpfs_node *)uma_zalloc_arg( tmp->tm_node_pool, tmp, M_WAITOK); /* Generic initialization. */ nnode->tn_type = type; vfs_timestamp(&nnode->tn_atime); nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = nnode->tn_atime; nnode->tn_uid = uid; nnode->tn_gid = gid; nnode->tn_mode = mode; nnode->tn_id = alloc_unr(tmp->tm_ino_unr); /* Type-specific initialization. */ switch (nnode->tn_type) { case VBLK: case VCHR: nnode->tn_rdev = rdev; break; case VDIR: TAILQ_INIT(&nnode->tn_dir.tn_dirhead); MPASS(parent != nnode); MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; nnode->tn_dir.tn_readdir_lastn = 0; nnode->tn_dir.tn_readdir_lastp = NULL; nnode->tn_links++; nnode->tn_dir.tn_parent->tn_links++; break; case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: MPASS(strlen(target) < MAXPATHLEN); nnode->tn_size = strlen(target); nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME, M_WAITOK); memcpy(nnode->tn_link, target, nnode->tn_size); break; case VREG: nnode->tn_reg.tn_aobj = vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0); nnode->tn_reg.tn_aobj_pages = 0; break; default: panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type); } TMPFS_LOCK(tmp); LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); tmp->tm_nodes_inuse++; TMPFS_UNLOCK(tmp); *node = nnode; return 0; } /* --------------------------------------------------------------------- */ /* * Destroys the node pointed to by node from the file system 'tmp'. * If the node does not belong to the given mount point, the results are * unpredicted. * * If the node references a directory; no entries are allowed because * their removal could need a recursive algorithm, something forbidden in * kernel space. Furthermore, there is not need to provide such * functionality (recursive removal) because the only primitives offered * to the user are the removal of empty directories and the deletion of * individual files. * * Note that nodes are not really deleted; in fact, when a node has been * allocated, it cannot be deleted during the whole life of the file * system. Instead, they are moved to the available list and remain there * until reused. */ void tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) { size_t pages = 0; #ifdef INVARIANTS TMPFS_NODE_LOCK(node); MPASS(node->tn_vnode == NULL); TMPFS_NODE_UNLOCK(node); #endif TMPFS_LOCK(tmp); LIST_REMOVE(node, tn_entries); tmp->tm_nodes_inuse--; TMPFS_UNLOCK(tmp); switch (node->tn_type) { case VNON: /* Do not do anything. VNON is provided to let the * allocation routine clean itself easily by avoiding * duplicating code in it. */ /* FALLTHROUGH */ case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VDIR: /* FALLTHROUGH */ case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: free(node->tn_link, M_TMPFSNAME); break; case VREG: if (node->tn_reg.tn_aobj != NULL) vm_object_deallocate(node->tn_reg.tn_aobj); pages = node->tn_reg.tn_aobj_pages; break; default: panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); } free_unr(tmp->tm_ino_unr, node->tn_id); uma_zfree(tmp->tm_node_pool, node); TMPFS_LOCK(tmp); tmp->tm_pages_used -= pages; TMPFS_UNLOCK(tmp); } /* --------------------------------------------------------------------- */ /* * Allocates a new directory entry for the node node with a name of name. * The new directory entry is returned in *de. * * The link count of node is increased by one to reflect the new object * referencing it. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, const char *name, uint16_t len, struct tmpfs_dirent **de) { struct tmpfs_dirent *nde; nde = (struct tmpfs_dirent *)uma_zalloc( tmp->tm_dirent_pool, M_WAITOK); nde->td_name = malloc(len, M_TMPFSNAME, M_WAITOK); nde->td_namelen = len; memcpy(nde->td_name, name, len); nde->td_node = node; node->tn_links++; *de = nde; return 0; } /* --------------------------------------------------------------------- */ /* * Frees a directory entry. It is the caller's responsibility to destroy * the node referenced by it if needed. * * The link count of node is decreased by one to reflect the removal of an * object that referenced it. This only happens if 'node_exists' is true; * otherwise the function will not access the node referred to by the * directory entry, as it may already have been released from the outside. */ void tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de, boolean_t node_exists) { if (node_exists) { struct tmpfs_node *node; node = de->td_node; MPASS(node->tn_links > 0); node->tn_links--; } free(de->td_name, M_TMPFSNAME); uma_zfree(tmp->tm_dirent_pool, de); } /* --------------------------------------------------------------------- */ /* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, struct vnode **vpp, struct thread *td) { int error = 0; struct vnode *vp; loop: TMPFS_NODE_LOCK(node); if ((vp = node->tn_vnode) != NULL) { VI_LOCK(vp); TMPFS_NODE_UNLOCK(node); vholdl(vp); (void) vget(vp, lkflag | LK_INTERLOCK | LK_RETRY, td); vdrop(vp); /* * Make sure the vnode is still there after * getting the interlock to avoid racing a free. */ if (node->tn_vnode == NULL || node->tn_vnode != vp) { vput(vp); goto loop; } goto out; } /* * otherwise lock the vp list while we call getnewvnode * since that can block. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = msleep((caddr_t) &node->tn_vpstate, TMPFS_NODE_MTX(node), PDROP | PCATCH, "tmpfs_alloc_vp", 0); if (error) return error; goto loop; } else node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* Get a new vnode and associate it with our node. */ error = getnewvnode("tmpfs", mp, &tmpfs_vnodeop_entries, &vp); if (error != 0) goto unlock; MPASS(vp != NULL); - (void) vn_lock(vp, lkflag | LK_RETRY, td); + (void) vn_lock(vp, lkflag | LK_RETRY); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VREG: /* FALLTHROUGH */ case VSOCK: break; case VFIFO: vp->v_op = &tmpfs_fifoop_entries; break; case VDIR: if (node->tn_dir.tn_parent == node) vp->v_vflag |= VV_ROOT; break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } vnode_pager_setsize(vp, node->tn_size); error = insmntque(vp, mp); if (error) { vgone(vp); vput(vp); vp = NULL; } unlock: TMPFS_NODE_LOCK(node); MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup((caddr_t) &node->tn_vpstate); } else TMPFS_NODE_UNLOCK(node); out: *vpp = vp; MPASS(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp, td))); #ifdef INVARIANTS TMPFS_NODE_LOCK(node); MPASS(*vpp == node->tn_vnode); TMPFS_NODE_UNLOCK(node); #endif return error; } /* --------------------------------------------------------------------- */ /* * Destroys the association between the vnode vp and the node it * references. */ void tmpfs_free_vp(struct vnode *vp) { struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); TMPFS_NODE_LOCK(node); node->tn_vnode = NULL; vp->v_data = NULL; TMPFS_NODE_UNLOCK(node); } /* --------------------------------------------------------------------- */ /* * Allocates a new file of type 'type' and adds it to the parent directory * 'dvp'; this addition is done using the component name given in 'cnp'. * The ownership of the new file is automatically assigned based on the * credentials of the caller (through 'cnp'), the group is set based on * the parent directory and the mode is determined from the 'vap' argument. * If successful, *vpp holds a vnode to the newly created file and zero * is returned. Otherwise *vpp is NULL and the function returns an * appropriate error code. */ int tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, struct componentname *cnp, char *target) { int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; struct tmpfs_node *parent; MPASS(VOP_ISLOCKED(dvp, cnp->cn_thread)); MPASS(cnp->cn_flags & HASBUF); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULL; /* If the entry we are creating is a directory, we cannot overflow * the number of links of its parent, because it will get a new * link. */ if (vap->va_type == VDIR) { /* Ensure that we do not overflow the maximum number of links * imposed by the system. */ MPASS(dnode->tn_links <= LINK_MAX); if (dnode->tn_links == LINK_MAX) { error = EMLINK; goto out; } parent = dnode; MPASS(parent != NULL); } else parent = NULL; /* Allocate a node that represents the new file. */ error = tmpfs_alloc_node(tmp, vap->va_type, cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev, cnp->cn_thread, &node); if (error != 0) goto out; /* Allocate a directory entry that points to the new file. */ error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) { tmpfs_free_node(tmp, node); goto out; } /* Allocate a vnode for the new file. */ error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp, cnp->cn_thread); if (error != 0) { tmpfs_free_dirent(tmp, de, TRUE); tmpfs_free_node(tmp, node); goto out; } /* Now that all required items are allocated, we can proceed to * insert the new node into the directory, an operation that * cannot fail. */ tmpfs_dir_attach(dvp, de); out: return error; } /* --------------------------------------------------------------------- */ /* * Attaches the directory entry de to the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_alloc_dirent. */ void tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; ASSERT_VOP_ELOCKED(vp, __func__); dnode = VP_TO_TMPFS_DIR(vp); TAILQ_INSERT_TAIL(&dnode->tn_dir.tn_dirhead, de, td_entries); dnode->tn_size += sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; } /* --------------------------------------------------------------------- */ /* * Detaches the directory entry de from the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_free_dirent. */ void tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; ASSERT_VOP_ELOCKED(vp, __func__); dnode = VP_TO_TMPFS_DIR(vp); if (dnode->tn_dir.tn_readdir_lastp == de) { dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; } TAILQ_REMOVE(&dnode->tn_dir.tn_dirhead, de, td_entries); dnode->tn_size -= sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; } /* --------------------------------------------------------------------- */ /* * Looks for a directory entry in the directory represented by node. * 'cnp' describes the name of the entry to look for. Note that the . * and .. components are not allowed as they do not physically exist * within directories. * * Returns a pointer to the entry when found, otherwise NULL. */ struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, struct componentname *cnp) { boolean_t found; struct tmpfs_dirent *de; MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.'))); TMPFS_VALIDATE_DIR(node); found = 0; TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { MPASS(cnp->cn_namelen < 0xffff); if (de->td_namelen == (uint16_t)cnp->cn_namelen && memcmp(de->td_name, cnp->cn_nameptr, de->td_namelen) == 0) { found = 1; break; } } node->tn_status |= TMPFS_NODE_ACCESSED; return found ? de : NULL; } struct tmpfs_dirent * tmpfs_dir_search(struct tmpfs_node *node, struct tmpfs_node *f) { struct tmpfs_dirent *de; TMPFS_VALIDATE_DIR(node); node->tn_status |= TMPFS_NODE_ACCESSED; TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { if (de->td_node == f) return (de); } return (NULL); } /* --------------------------------------------------------------------- */ /* * Helper function for tmpfs_readdir. Creates a '.' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ int tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); dent.d_fileno = node->tn_id; dent.d_type = DT_DIR; dent.d_namlen = 1; dent.d_name[0] = '.'; dent.d_name[1] = '\0'; dent.d_reclen = GENERIC_DIRSIZ(&dent); if (dent.d_reclen > uio->uio_resid) error = -1; else { error = uiomove(&dent, dent.d_reclen, uio); if (error == 0) uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; } node->tn_status |= TMPFS_NODE_ACCESSED; return error; } /* --------------------------------------------------------------------- */ /* * Helper function for tmpfs_readdir. Creates a '..' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ int tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); dent.d_fileno = node->tn_dir.tn_parent->tn_id; dent.d_type = DT_DIR; dent.d_namlen = 2; dent.d_name[0] = '.'; dent.d_name[1] = '.'; dent.d_name[2] = '\0'; dent.d_reclen = GENERIC_DIRSIZ(&dent); if (dent.d_reclen > uio->uio_resid) error = -1; else { error = uiomove(&dent, dent.d_reclen, uio); if (error == 0) { struct tmpfs_dirent *de; de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); if (de == NULL) uio->uio_offset = TMPFS_DIRCOOKIE_EOF; else uio->uio_offset = tmpfs_dircookie(de); } } node->tn_status |= TMPFS_NODE_ACCESSED; return error; } /* --------------------------------------------------------------------- */ /* * Lookup a directory entry by its associated cookie. */ struct tmpfs_dirent * tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie) { struct tmpfs_dirent *de; if (cookie == node->tn_dir.tn_readdir_lastn && node->tn_dir.tn_readdir_lastp != NULL) { return node->tn_dir.tn_readdir_lastp; } TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { if (tmpfs_dircookie(de) == cookie) { break; } } return de; } /* --------------------------------------------------------------------- */ /* * Helper function for tmpfs_readdir. Returns as much directory entries * as can fit in the uio space. The read starts at uio->uio_offset. * The function returns 0 on success, -1 if there was not enough space * in the uio structure to hold the directory entry or an appropriate * error code if another error happens. */ int tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp) { int error; off_t startcookie; struct tmpfs_dirent *de; TMPFS_VALIDATE_DIR(node); /* Locate the first directory entry we have to return. We have cached * the last readdir in the node, so use those values if appropriate. * Otherwise do a linear scan to find the requested entry. */ startcookie = uio->uio_offset; MPASS(startcookie != TMPFS_DIRCOOKIE_DOT); MPASS(startcookie != TMPFS_DIRCOOKIE_DOTDOT); if (startcookie == TMPFS_DIRCOOKIE_EOF) { return 0; } else { de = tmpfs_dir_lookupbycookie(node, startcookie); } if (de == NULL) { return EINVAL; } /* Read as much entries as possible; i.e., until we reach the end of * the directory or we exhaust uio space. */ do { struct dirent d; /* Create a dirent structure representing the current * tmpfs_node and fill it. */ d.d_fileno = de->td_node->tn_id; switch (de->td_node->tn_type) { case VBLK: d.d_type = DT_BLK; break; case VCHR: d.d_type = DT_CHR; break; case VDIR: d.d_type = DT_DIR; break; case VFIFO: d.d_type = DT_FIFO; break; case VLNK: d.d_type = DT_LNK; break; case VREG: d.d_type = DT_REG; break; case VSOCK: d.d_type = DT_SOCK; break; default: panic("tmpfs_dir_getdents: type %p %d", de->td_node, (int)de->td_node->tn_type); } d.d_namlen = de->td_namelen; MPASS(de->td_namelen < sizeof(d.d_name)); (void)memcpy(d.d_name, de->td_name, de->td_namelen); d.d_name[de->td_namelen] = '\0'; d.d_reclen = GENERIC_DIRSIZ(&d); /* Stop reading if the directory entry we are treating is * bigger than the amount of data that can be returned. */ if (d.d_reclen > uio->uio_resid) { error = -1; break; } /* Copy the new dirent structure into the output buffer and * advance pointers. */ error = uiomove(&d, d.d_reclen, uio); (*cntp)++; de = TAILQ_NEXT(de, td_entries); } while (error == 0 && uio->uio_resid > 0 && de != NULL); /* Update the offset and cache. */ if (de == NULL) { uio->uio_offset = TMPFS_DIRCOOKIE_EOF; node->tn_dir.tn_readdir_lastn = 0; node->tn_dir.tn_readdir_lastp = NULL; } else { node->tn_dir.tn_readdir_lastn = uio->uio_offset = tmpfs_dircookie(de); node->tn_dir.tn_readdir_lastp = de; } node->tn_status |= TMPFS_NODE_ACCESSED; return error; } /* --------------------------------------------------------------------- */ /* * Resizes the aobj associated to the regular file pointed to by vp to * the size newsize. 'vp' must point to a vnode that represents a regular * file. 'newsize' must be positive. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize) { int error; size_t newpages, oldpages; struct tmpfs_mount *tmp; struct tmpfs_node *node; off_t oldsize; MPASS(vp->v_type == VREG); MPASS(newsize >= 0); node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); /* Convert the old and new sizes to the number of pages needed to * store them. It may happen that we do not need to do anything * because the last allocated page can accommodate the change on * its own. */ oldsize = node->tn_size; oldpages = round_page(oldsize) / PAGE_SIZE; MPASS(oldpages == node->tn_reg.tn_aobj_pages); newpages = round_page(newsize) / PAGE_SIZE; if (newpages > oldpages && newpages - oldpages > TMPFS_PAGES_AVAIL(tmp)) { error = ENOSPC; goto out; } node->tn_reg.tn_aobj_pages = newpages; TMPFS_LOCK(tmp); tmp->tm_pages_used += (newpages - oldpages); TMPFS_UNLOCK(tmp); node->tn_size = newsize; vnode_pager_setsize(vp, newsize); if (newsize < oldsize) { size_t zerolen = round_page(newsize) - newsize; vm_object_t uobj = node->tn_reg.tn_aobj; vm_page_t m; /* * free "backing store" */ VM_OBJECT_LOCK(uobj); if (newpages < oldpages) { swap_pager_freespace(uobj, newpages, oldpages - newpages); vm_object_page_remove(uobj, OFF_TO_IDX(newsize + PAGE_MASK), 0, FALSE); } /* * zero out the truncated part of the last page. */ if (zerolen > 0) { m = vm_page_grab(uobj, OFF_TO_IDX(newsize), VM_ALLOC_NORMAL | VM_ALLOC_RETRY); pmap_zero_page_area(m, PAGE_SIZE - zerolen, zerolen); vm_page_wakeup(m); } VM_OBJECT_UNLOCK(uobj); } error = 0; out: return error; } /* --------------------------------------------------------------------- */ /* * Change flags of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chflags(struct vnode *vp, int flags, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, p)); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } /* Snapshot flag cannot be set or cleared */ if (((flags & SF_SNAPSHOT) != 0 && (node->tn_flags & SF_SNAPSHOT) == 0) || ((flags & SF_SNAPSHOT) == 0 && (node->tn_flags & SF_SNAPSHOT) != 0)) return (EPERM); node->tn_flags = flags; } else { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || (flags & UF_SETTABLE) != flags) return (EPERM); node->tn_flags &= SF_SETTABLE; node->tn_flags |= (flags & UF_SETTABLE); } node->tn_status |= TMPFS_NODE_CHANGED; MPASS(VOP_ISLOCKED(vp, p)); return 0; } /* --------------------------------------------------------------------- */ /* * Change access mode on the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, p)); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0)) return (EFTYPE); } if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID, 0); if (error) return (error); } node->tn_mode &= ~ALLPERMS; node->tn_mode |= mode & ALLPERMS; node->tn_status |= TMPFS_NODE_CHANGED; MPASS(VOP_ISLOCKED(vp, p)); return 0; } /* --------------------------------------------------------------------- */ /* * Change ownership of the given vnode. At least one of uid or gid must * be different than VNOVAL. If one is set to that value, the attribute * is unchanged. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; uid_t ouid; gid_t ogid; MPASS(VOP_ISLOCKED(vp, p)); node = VP_TO_TMPFS_NODE(vp); /* Assign default values if they are unknown. */ MPASS(uid != VNOVAL || gid != VNOVAL); if (uid == VNOVAL) uid = node->tn_uid; if (gid == VNOVAL) gid = node->tn_gid; MPASS(uid != VNOVAL && gid != VNOVAL); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if ((uid != node->tn_uid || (gid != node->tn_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0))) return (error); ogid = node->tn_gid; ouid = node->tn_uid; node->tn_uid = uid; node->tn_gid = gid; node->tn_status |= TMPFS_NODE_CHANGED; if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) node->tn_mode &= ~(S_ISUID | S_ISGID); } MPASS(VOP_ISLOCKED(vp, p)); return 0; } /* --------------------------------------------------------------------- */ /* * Change size of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, p)); node = VP_TO_TMPFS_NODE(vp); /* Decide whether this is a valid operation based on the file type. */ error = 0; switch (vp->v_type) { case VDIR: return EISDIR; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VFIFO: /* Allow modifications of special files even if in the file * system is mounted read-only (we are not modifying the * files themselves, but the objects they represent). */ return 0; default: /* Anything else is unsupported. */ return EOPNOTSUPP; } /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = tmpfs_truncate(vp, size); /* tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents * for us, as will update tn_status; no need to do that here. */ MPASS(VOP_ISLOCKED(vp, p)); return error; } /* --------------------------------------------------------------------- */ /* * Change access and modification times of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chtimes(struct vnode *vp, struct timespec *atime, struct timespec *mtime, struct timespec *birthtime, int vaflags, struct ucred *cred, struct thread *l) { int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, l)); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* Determine if the user have proper privilege to update time. */ if (vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, cred, l); if (error) error = VOP_ACCESS(vp, VWRITE, cred, l); } else error = VOP_ACCESS(vp, VADMIN, cred, l); if (error) return (error); if (atime->tv_sec != VNOVAL && atime->tv_nsec != VNOVAL) node->tn_status |= TMPFS_NODE_ACCESSED; if (mtime->tv_sec != VNOVAL && mtime->tv_nsec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; if (birthtime->tv_nsec != VNOVAL && birthtime->tv_nsec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; tmpfs_itimes(vp, atime, mtime); if (birthtime->tv_nsec != VNOVAL && birthtime->tv_nsec != VNOVAL) node->tn_birthtime = *birthtime; MPASS(VOP_ISLOCKED(vp, l)); return 0; } /* --------------------------------------------------------------------- */ /* Sync timestamps */ void tmpfs_itimes(struct vnode *vp, const struct timespec *acc, const struct timespec *mod) { struct tmpfs_node *node; struct timespec now; node = VP_TO_TMPFS_NODE(vp); if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) return; vfs_timestamp(&now); if (node->tn_status & TMPFS_NODE_ACCESSED) { if (acc == NULL) acc = &now; node->tn_atime = *acc; } if (node->tn_status & TMPFS_NODE_MODIFIED) { if (mod == NULL) mod = &now; node->tn_mtime = *mod; } if (node->tn_status & TMPFS_NODE_CHANGED) { node->tn_ctime = now; } node->tn_status &= ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); } /* --------------------------------------------------------------------- */ void tmpfs_update(struct vnode *vp) { tmpfs_itimes(vp, NULL, NULL); } /* --------------------------------------------------------------------- */ int tmpfs_truncate(struct vnode *vp, off_t length) { boolean_t extended; int error; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); extended = length > node->tn_size; if (length < 0) { error = EINVAL; goto out; } if (node->tn_size == length) { error = 0; goto out; } if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); error = tmpfs_reg_resize(vp, length); if (error == 0) { node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; } out: tmpfs_update(vp); return error; } Index: head/sys/fs/tmpfs/tmpfs_vfsops.c =================================================================== --- head/sys/fs/tmpfs/tmpfs_vfsops.c (revision 175201) +++ head/sys/fs/tmpfs/tmpfs_vfsops.c (revision 175202) @@ -1,473 +1,473 @@ /* $NetBSD: tmpfs_vfsops.c,v 1.10 2005/12/11 12:24:29 christos Exp $ */ /* * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system. * * tmpfs is a file system that uses NetBSD's virtual memory sub-system * (the well-known UVM) to store file data and metadata in an efficient * way. This means that it does not follow the structure of an on-disk * file system because it simply does not need to. Instead, it uses * memory-specific data structures and algorithms to automatically * allocate and release resources. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * Default permission for root node */ #define TMPFS_DEFAULT_ROOT_MODE (S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) MALLOC_DEFINE(M_TMPFSMNT, "tmpfs mount", "tmpfs mount structures"); MALLOC_DEFINE(M_TMPFSNAME, "tmpfs name", "tmpfs file names"); /* --------------------------------------------------------------------- */ static int tmpfs_mount(struct mount *, struct thread *); static int tmpfs_unmount(struct mount *, int, struct thread *); static int tmpfs_root(struct mount *, int flags, struct vnode **, struct thread *); static int tmpfs_fhtovp(struct mount *, struct fid *, struct vnode **); static int tmpfs_statfs(struct mount *, struct statfs *, struct thread *); /* --------------------------------------------------------------------- */ static const char *tmpfs_opts[] = { "from", "size", "inodes", "uid", "gid", "mode", "export", NULL }; /* --------------------------------------------------------------------- */ #define SWI_MAXMIB 3 static u_int get_swpgtotal(void) { struct xswdev xsd; char *sname = "vm.swap_info"; int soid[SWI_MAXMIB], oid[2]; u_int unswdev, total, dmmax, nswapdev; size_t mibi, len; total = 0; len = sizeof(dmmax); if (kernel_sysctlbyname(curthread, "vm.dmmax", &dmmax, &len, NULL, 0, NULL, 0) != 0) return total; len = sizeof(nswapdev); if (kernel_sysctlbyname(curthread, "vm.nswapdev", &nswapdev, &len, NULL, 0, NULL, 0) != 0) return total; mibi = (SWI_MAXMIB - 1) * sizeof(int); oid[0] = 0; oid[1] = 3; if (kernel_sysctl(curthread, oid, 2, soid, &mibi, (void *)sname, strlen(sname), NULL, 0) != 0) return total; mibi = (SWI_MAXMIB - 1); for (unswdev = 0; unswdev < nswapdev; ++unswdev) { soid[mibi] = unswdev; len = sizeof(struct xswdev); if (kernel_sysctl(curthread, soid, mibi + 1, &xsd, &len, NULL, 0, NULL, 0) != 0) return total; if (len == sizeof(struct xswdev)) total += (xsd.xsw_nblks - dmmax); } /* Not Reached */ return total; } /* --------------------------------------------------------------------- */ static int tmpfs_node_ctor(void *mem, int size, void *arg, int flags) { struct tmpfs_node *node = (struct tmpfs_node *)mem; node->tn_gen++; node->tn_size = 0; node->tn_status = 0; node->tn_flags = 0; node->tn_links = 0; node->tn_lockf = NULL; node->tn_vnode = NULL; node->tn_vpstate = 0; return (0); } static void tmpfs_node_dtor(void *mem, int size, void *arg) { struct tmpfs_node *node = (struct tmpfs_node *)mem; node->tn_type = VNON; } static int tmpfs_node_init(void *mem, int size, int flags) { struct tmpfs_node *node = (struct tmpfs_node *)mem; node->tn_id = 0; mtx_init(&node->tn_interlock, "tmpfs node interlock", NULL, MTX_DEF); node->tn_gen = arc4random(); return (0); } static void tmpfs_node_fini(void *mem, int size) { struct tmpfs_node *node = (struct tmpfs_node *)mem; mtx_destroy(&node->tn_interlock); } static int tmpfs_mount(struct mount *mp, struct thread *td) { struct tmpfs_mount *tmp; struct tmpfs_node *root; size_t pages, mem_size; ino_t nodes; int error; /* Size counters. */ ino_t nodes_max; size_t size_max; /* Root node attributes. */ uid_t root_uid; gid_t root_gid; mode_t root_mode; struct vattr va; if (vfs_filteropt(mp->mnt_optnew, tmpfs_opts)) return (EINVAL); if (mp->mnt_flag & MNT_UPDATE) { /* XXX: There is no support yet to update file system * settings. Should be added. */ return EOPNOTSUPP; } printf("WARNING: TMPFS is considered to be a highly experimental " "feature in FreeBSD.\n"); - vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY, td); + vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY); error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred, td); VOP_UNLOCK(mp->mnt_vnodecovered, 0, td); if (error) return (error); if (mp->mnt_cred->cr_ruid != 0 || vfs_scanopt(mp->mnt_optnew, "gid", "%d", &root_gid) != 1) root_gid = va.va_gid; if (mp->mnt_cred->cr_ruid != 0 || vfs_scanopt(mp->mnt_optnew, "uid", "%d", &root_uid) != 1) root_uid = va.va_uid; if (mp->mnt_cred->cr_ruid != 0 || vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1) root_mode = va.va_mode; if (vfs_scanopt(mp->mnt_optnew, "inodes", "%d", &nodes_max) != 1) nodes_max = 0; if (vfs_scanopt(mp->mnt_optnew, "size", "%qu", &size_max) != 1) size_max = 0; /* Do not allow mounts if we do not have enough memory to preserve * the minimum reserved pages. */ mem_size = cnt.v_free_count + cnt.v_inactive_count + get_swpgtotal(); mem_size -= mem_size > cnt.v_wire_count ? cnt.v_wire_count : mem_size; if (mem_size < TMPFS_PAGES_RESERVED) return ENOSPC; /* Get the maximum number of memory pages this file system is * allowed to use, based on the maximum size the user passed in * the mount structure. A value of zero is treated as if the * maximum available space was requested. */ if (size_max < PAGE_SIZE || size_max >= SIZE_MAX) pages = SIZE_MAX; else pages = howmany(size_max, PAGE_SIZE); MPASS(pages > 0); if (nodes_max <= 3) nodes = 3 + pages * PAGE_SIZE / 1024; else nodes = nodes_max; MPASS(nodes >= 3); /* Allocate the tmpfs mount structure and fill it. */ tmp = (struct tmpfs_mount *)malloc(sizeof(struct tmpfs_mount), M_TMPFSMNT, M_WAITOK | M_ZERO); mtx_init(&tmp->allnode_lock, "tmpfs allnode lock", NULL, MTX_DEF); tmp->tm_nodes_max = nodes; tmp->tm_nodes_inuse = 0; tmp->tm_maxfilesize = (u_int64_t)(cnt.v_page_count + get_swpgtotal()) * PAGE_SIZE; LIST_INIT(&tmp->tm_nodes_used); tmp->tm_pages_max = pages; tmp->tm_pages_used = 0; tmp->tm_ino_unr = new_unrhdr(2, INT_MAX, &tmp->allnode_lock); tmp->tm_dirent_pool = uma_zcreate("TMPFS dirent", sizeof(struct tmpfs_dirent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tmp->tm_node_pool = uma_zcreate("TMPFS node", sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); /* Allocate the root node. */ error = tmpfs_alloc_node(tmp, VDIR, root_uid, root_gid, root_mode & ALLPERMS, NULL, NULL, VNOVAL, td, &root); if (error != 0 || root == NULL) { uma_zdestroy(tmp->tm_node_pool); uma_zdestroy(tmp->tm_dirent_pool); delete_unrhdr(tmp->tm_ino_unr); free(tmp, M_TMPFSMNT); return error; } KASSERT(root->tn_id == 2, ("tmpfs root with invalid ino: %d", root->tn_id)); tmp->tm_root = root; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); mp->mnt_data = tmp; mp->mnt_stat.f_namemax = MAXNAMLEN; vfs_getnewfsid(mp); vfs_mountedfrom(mp, "tmpfs"); return 0; } /* --------------------------------------------------------------------- */ /* ARGSUSED2 */ static int tmpfs_unmount(struct mount *mp, int mntflags, struct thread *l) { int error; int flags = 0; struct tmpfs_mount *tmp; struct tmpfs_node *node; /* Handle forced unmounts. */ if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* Finalize all pending I/O. */ error = vflush(mp, 0, flags, l); if (error != 0) return error; tmp = VFS_TO_TMPFS(mp); /* Free all associated data. The loop iterates over the linked list * we have containing all used nodes. For each of them that is * a directory, we free all its directory entries. Note that after * freeing a node, it will automatically go to the available list, * so we will later have to iterate over it to release its items. */ node = LIST_FIRST(&tmp->tm_nodes_used); while (node != NULL) { struct tmpfs_node *next; if (node->tn_type == VDIR) { struct tmpfs_dirent *de; de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); while (de != NULL) { struct tmpfs_dirent *nde; nde = TAILQ_NEXT(de, td_entries); tmpfs_free_dirent(tmp, de, FALSE); de = nde; node->tn_size -= sizeof(struct tmpfs_dirent); } } next = LIST_NEXT(node, tn_entries); tmpfs_free_node(tmp, node); node = next; } uma_zdestroy(tmp->tm_dirent_pool); uma_zdestroy(tmp->tm_node_pool); delete_unrhdr(tmp->tm_ino_unr); mtx_destroy(&tmp->allnode_lock); MPASS(tmp->tm_pages_used == 0); MPASS(tmp->tm_nodes_inuse == 0); /* Throw away the tmpfs_mount structure. */ free(mp->mnt_data, M_TMPFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return 0; } /* --------------------------------------------------------------------- */ static int tmpfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { int error; error = tmpfs_alloc_vp(mp, VFS_TO_TMPFS(mp)->tm_root, flags, vpp, td); if (!error) (*vpp)->v_vflag |= VV_ROOT; return error; } /* --------------------------------------------------------------------- */ static int tmpfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) { boolean_t found; struct tmpfs_fid *tfhp; struct tmpfs_mount *tmp; struct tmpfs_node *node; tmp = VFS_TO_TMPFS(mp); tfhp = (struct tmpfs_fid *)fhp; if (tfhp->tf_len != sizeof(struct tmpfs_fid)) return EINVAL; if (tfhp->tf_id >= tmp->tm_nodes_max) return EINVAL; found = FALSE; TMPFS_LOCK(tmp); LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { if (node->tn_id == tfhp->tf_id && node->tn_gen == tfhp->tf_gen) { found = TRUE; break; } } TMPFS_UNLOCK(tmp); if (found) return (tmpfs_alloc_vp(mp, node, LK_EXCLUSIVE, vpp, curthread)); return (EINVAL); } /* --------------------------------------------------------------------- */ /* ARGSUSED2 */ static int tmpfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *l) { fsfilcnt_t freenodes; struct tmpfs_mount *tmp; tmp = VFS_TO_TMPFS(mp); sbp->f_iosize = PAGE_SIZE; sbp->f_bsize = PAGE_SIZE; sbp->f_blocks = TMPFS_PAGES_MAX(tmp); sbp->f_bavail = sbp->f_bfree = TMPFS_PAGES_AVAIL(tmp); freenodes = MIN(tmp->tm_nodes_max - tmp->tm_nodes_inuse, TMPFS_PAGES_AVAIL(tmp) * PAGE_SIZE / sizeof(struct tmpfs_node)); sbp->f_files = freenodes + tmp->tm_nodes_inuse; sbp->f_ffree = freenodes; /* sbp->f_owner = tmp->tn_uid; */ return 0; } /* --------------------------------------------------------------------- */ /* * tmpfs vfs operations. */ struct vfsops tmpfs_vfsops = { .vfs_mount = tmpfs_mount, .vfs_unmount = tmpfs_unmount, .vfs_root = tmpfs_root, .vfs_statfs = tmpfs_statfs, .vfs_fhtovp = tmpfs_fhtovp, }; VFS_SET(tmpfs_vfsops, tmpfs, 0); Index: head/sys/fs/tmpfs/tmpfs_vnops.c =================================================================== --- head/sys/fs/tmpfs/tmpfs_vnops.c (revision 175201) +++ head/sys/fs/tmpfs/tmpfs_vnops.c (revision 175202) @@ -1,1499 +1,1499 @@ /* $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $ */ /* * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* --------------------------------------------------------------------- */ static int tmpfs_lookup(struct vop_cachedlookup_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct thread *td = cnp->cn_thread; int error; struct tmpfs_dirent *de; struct tmpfs_node *dnode; dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULLVP; /* Check accessibility of requested node as a first step. */ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error != 0) goto out; /* We cannot be requesting the parent directory of the root node. */ MPASS(IMPLIES(dnode->tn_type == VDIR && dnode->tn_dir.tn_parent == dnode, !(cnp->cn_flags & ISDOTDOT))); if (cnp->cn_flags & ISDOTDOT) { int ltype = 0; ltype = VOP_ISLOCKED(dvp, td); vhold(dvp); VOP_UNLOCK(dvp, 0, td); /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent, cnp->cn_lkflags, vpp, td); - vn_lock(dvp, ltype | LK_RETRY, td); + vn_lock(dvp, ltype | LK_RETRY); vdrop(dvp); } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { VREF(dvp); *vpp = dvp; error = 0; } else { de = tmpfs_dir_lookup(dnode, cnp); if (de == NULL) { /* The entry was not found in the directory. * This is OK if we are creating or renaming an * entry and are working on the last component of * the path name. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE || \ cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Keep the component name in the buffer for * future uses. */ cnp->cn_flags |= SAVENAME; error = EJUSTRETURN; } else error = ENOENT; } else { struct tmpfs_node *tnode; /* The entry was found, so get its associated * tmpfs_node. */ tnode = de->td_node; /* If we are not at the last path component and * found a non-directory or non-link entry (which * may itself be pointing to a directory), raise * an error. */ if ((tnode->tn_type != VDIR && tnode->tn_type != VLNK) && !(cnp->cn_flags & ISLASTCN)) { error = ENOTDIR; goto out; } /* If we are deleting or renaming the entry, keep * track of its tmpfs_dirent so that it can be * easily deleted later. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp, td); if (error != 0) goto out; if ((dnode->tn_mode & S_ISTXT) && VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, cnp->cn_thread) && VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, cnp->cn_thread)) { error = EPERM; vput(*vpp); *vpp = NULL; goto out; } cnp->cn_flags |= SAVENAME; } else { error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp, td); } } } /* Store the result of this lookup in the cache. Avoid this if the * request was for creation, as it does not improve timings on * emprical tests. */ if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE) cache_enter(dvp, *vpp, cnp); out: /* If there were no errors, *vpp cannot be null and it must be * locked. */ MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp, td))); return error; } /* --------------------------------------------------------------------- */ static int tmpfs_create(struct vop_create_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; MPASS(vap->va_type == VREG || vap->va_type == VSOCK); return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); } /* --------------------------------------------------------------------- */ static int tmpfs_mknod(struct vop_mknod_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO) return EINVAL; return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); } /* --------------------------------------------------------------------- */ static int tmpfs_open(struct vop_open_args *v) { struct vnode *vp = v->a_vp; int mode = v->a_mode; int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, v->a_td)); node = VP_TO_TMPFS_NODE(vp); /* The file is still active but all its names have been removed * (e.g. by a "rmdir $(pwd)"). It cannot be opened any more as * it is about to die. */ if (node->tn_links < 1) return (ENOENT); /* If the file is marked append-only, deny write requests. */ if (node->tn_flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE) error = EPERM; else { error = 0; vnode_create_vobject(vp, node->tn_size, v->a_td); } MPASS(VOP_ISLOCKED(vp, v->a_td)); return error; } /* --------------------------------------------------------------------- */ static int tmpfs_close(struct vop_close_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, v->a_td)); node = VP_TO_TMPFS_NODE(vp); if (node->tn_links > 0) { /* Update node times. No need to do it if the node has * been deleted, because it will vanish after we return. */ tmpfs_update(vp); } return 0; } /* --------------------------------------------------------------------- */ int tmpfs_access(struct vop_access_args *v) { struct vnode *vp = v->a_vp; int mode = v->a_mode; struct ucred *cred = v->a_cred; int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, v->a_td)); node = VP_TO_TMPFS_NODE(vp); switch (vp->v_type) { case VDIR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VREG: if (mode & VWRITE && vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VSOCK: /* FALLTHROUGH */ case VFIFO: break; default: error = EINVAL; goto out; } if (mode & VWRITE && node->tn_flags & IMMUTABLE) { error = EPERM; goto out; } error = vaccess(vp->v_type, node->tn_mode, node->tn_uid, node->tn_gid, mode, cred, NULL); out: MPASS(VOP_ISLOCKED(vp, v->a_td)); return error; } /* --------------------------------------------------------------------- */ int tmpfs_getattr(struct vop_getattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); VATTR_NULL(vap); tmpfs_update(vp); vap->va_type = vp->v_type; vap->va_mode = node->tn_mode; vap->va_nlink = node->tn_links; vap->va_uid = node->tn_uid; vap->va_gid = node->tn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = node->tn_id; vap->va_size = node->tn_size; vap->va_blocksize = PAGE_SIZE; vap->va_atime = node->tn_atime; vap->va_mtime = node->tn_mtime; vap->va_ctime = node->tn_ctime; vap->va_birthtime = node->tn_birthtime; vap->va_gen = node->tn_gen; vap->va_flags = node->tn_flags; vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? node->tn_rdev : VNOVAL; vap->va_bytes = round_page(node->tn_size); vap->va_filerev = VNOVAL; vap->va_vaflags = 0; vap->va_spare = VNOVAL; /* XXX */ return 0; } /* --------------------------------------------------------------------- */ /* XXX Should this operation be atomic? I think it should, but code in * XXX other places (e.g., ufs) doesn't seem to be... */ int tmpfs_setattr(struct vop_setattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; struct ucred *cred = v->a_cred; struct thread *l = v->a_td; int error; MPASS(VOP_ISLOCKED(vp, l)); error = 0; /* Abort if any unsettable attribute is given. */ if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || vap->va_bytes != VNOVAL) error = EINVAL; if (error == 0 && (vap->va_flags != VNOVAL)) error = tmpfs_chflags(vp, vap->va_flags, cred, l); if (error == 0 && (vap->va_size != VNOVAL)) error = tmpfs_chsize(vp, vap->va_size, cred, l); if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred, l); if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) error = tmpfs_chmod(vp, vap->va_mode, cred, l); if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL && vap->va_atime.tv_nsec != VNOVAL) || (vap->va_mtime.tv_sec != VNOVAL && vap->va_mtime.tv_nsec != VNOVAL) || (vap->va_birthtime.tv_sec != VNOVAL && vap->va_birthtime.tv_nsec != VNOVAL))) error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime, &vap->va_birthtime, vap->va_vaflags, cred, l); /* Update the node times. We give preference to the error codes * generated by this function rather than the ones that may arise * from tmpfs_update. */ tmpfs_update(vp); MPASS(VOP_ISLOCKED(vp, l)); return error; } /* --------------------------------------------------------------------- */ static int tmpfs_mappedread(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *uio) { vm_pindex_t idx; vm_page_t m; struct sf_buf *sf; off_t offset, addr; size_t tlen; caddr_t va; int error; addr = uio->uio_offset; idx = OFF_TO_IDX(addr); offset = addr & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); if ((vobj == NULL) || (vobj->resident_page_count == 0)) goto nocache; VM_OBJECT_LOCK(vobj); lookupvpg: if (((m = vm_page_lookup(vobj, idx)) != NULL) && vm_page_is_valid(m, offset, tlen)) { if (vm_page_sleep_if_busy(m, FALSE, "tmfsmr")) goto lookupvpg; vm_page_busy(m); VM_OBJECT_UNLOCK(vobj); sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = uiomove(va + offset, tlen, uio); sf_buf_free(sf); sched_unpin(); VM_OBJECT_LOCK(vobj); vm_page_wakeup(m); VM_OBJECT_UNLOCK(vobj); return (error); } VM_OBJECT_UNLOCK(vobj); nocache: VM_OBJECT_LOCK(tobj); vm_object_pip_add(tobj, 1); m = vm_page_grab(tobj, idx, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { int behind, ahead; if (vm_pager_has_page(tobj, idx, &behind, &ahead)) { error = vm_pager_get_pages(tobj, &m, 1, 0); if (error != 0) { printf("tmpfs get pages from pager error [read]\n"); goto out; } } else vm_page_zero_invalid(m, TRUE); } VM_OBJECT_UNLOCK(tobj); sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = uiomove(va + offset, tlen, uio); sf_buf_free(sf); sched_unpin(); VM_OBJECT_LOCK(tobj); out: vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_activate(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_object_pip_subtract(tobj, 1); VM_OBJECT_UNLOCK(tobj); return (error); } static int tmpfs_read(struct vop_read_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; struct tmpfs_node *node; vm_object_t uobj; size_t len; int resid; int error = 0; node = VP_TO_TMPFS_NODE(vp); if (vp->v_type != VREG) { error = EISDIR; goto out; } if (uio->uio_offset < 0) { error = EINVAL; goto out; } node->tn_status |= TMPFS_NODE_ACCESSED; uobj = node->tn_reg.tn_aobj; while ((resid = uio->uio_resid) > 0) { error = 0; if (node->tn_size <= uio->uio_offset) break; len = MIN(node->tn_size - uio->uio_offset, resid); if (len == 0) break; error = tmpfs_mappedread(vp->v_object, uobj, len, uio); if ((error != 0) || (resid == uio->uio_resid)) break; } out: return error; } /* --------------------------------------------------------------------- */ static int tmpfs_mappedwrite(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *uio) { vm_pindex_t idx; vm_page_t vpg, tpg; struct sf_buf *sf; off_t offset, addr; size_t tlen; caddr_t va; int error; error = 0; addr = uio->uio_offset; idx = OFF_TO_IDX(addr); offset = addr & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); if ((vobj == NULL) || (vobj->resident_page_count == 0)) { vpg = NULL; goto nocache; } VM_OBJECT_LOCK(vobj); lookupvpg: if (((vpg = vm_page_lookup(vobj, idx)) != NULL) && vm_page_is_valid(vpg, offset, tlen)) { if (vm_page_sleep_if_busy(vpg, FALSE, "tmfsmw")) goto lookupvpg; vm_page_busy(vpg); vm_page_lock_queues(); vm_page_undirty(vpg); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(vobj); sched_pin(); sf = sf_buf_alloc(vpg, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = uiomove(va + offset, tlen, uio); sf_buf_free(sf); sched_unpin(); } else { VM_OBJECT_UNLOCK(vobj); vpg = NULL; } nocache: VM_OBJECT_LOCK(tobj); vm_object_pip_add(tobj, 1); tpg = vm_page_grab(tobj, idx, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (tpg->valid != VM_PAGE_BITS_ALL) { int behind, ahead; if (vm_pager_has_page(tobj, idx, &behind, &ahead)) { error = vm_pager_get_pages(tobj, &tpg, 1, 0); if (error != 0) { printf("tmpfs get pages from pager error [write]\n"); goto out; } } else vm_page_zero_invalid(tpg, TRUE); } VM_OBJECT_UNLOCK(tobj); if (vpg == NULL) { sched_pin(); sf = sf_buf_alloc(tpg, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = uiomove(va + offset, tlen, uio); sf_buf_free(sf); sched_unpin(); } else { KASSERT(vpg->valid == VM_PAGE_BITS_ALL, ("parts of vpg invalid")); pmap_copy_page(vpg, tpg); } VM_OBJECT_LOCK(tobj); out: if (vobj != NULL) VM_OBJECT_LOCK(vobj); vm_page_lock_queues(); if (error == 0) { vm_page_set_validclean(tpg, offset, tlen); vm_page_zero_invalid(tpg, TRUE); vm_page_dirty(tpg); } vm_page_unwire(tpg, 0); vm_page_activate(tpg); vm_page_unlock_queues(); vm_page_wakeup(tpg); if (vpg != NULL) vm_page_wakeup(vpg); if (vobj != NULL) VM_OBJECT_UNLOCK(vobj); vm_object_pip_subtract(tobj, 1); VM_OBJECT_UNLOCK(tobj); return (error); } static int tmpfs_write(struct vop_write_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; int ioflag = v->a_ioflag; struct thread *td = uio->uio_td; boolean_t extended; int error = 0; off_t oldsize; struct tmpfs_node *node; vm_object_t uobj; size_t len; int resid; node = VP_TO_TMPFS_NODE(vp); oldsize = node->tn_size; if (uio->uio_offset < 0 || vp->v_type != VREG) { error = EINVAL; goto out; } if (uio->uio_resid == 0) { error = 0; goto out; } if (ioflag & IO_APPEND) uio->uio_offset = node->tn_size; if (uio->uio_offset + uio->uio_resid > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); if (vp->v_type == VREG && td != NULL) { PROC_LOCK(td->td_proc); if (uio->uio_offset + uio->uio_resid > lim_cur(td->td_proc, RLIMIT_FSIZE)) { psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); } extended = uio->uio_offset + uio->uio_resid > node->tn_size; if (extended) { error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid); if (error != 0) goto out; } uobj = node->tn_reg.tn_aobj; while ((resid = uio->uio_resid) > 0) { if (node->tn_size <= uio->uio_offset) break; len = MIN(node->tn_size - uio->uio_offset, resid); if (len == 0) break; error = tmpfs_mappedwrite(vp->v_object, uobj, len, uio); if ((error != 0) || (resid == uio->uio_resid)) break; } node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | (extended ? TMPFS_NODE_CHANGED : 0); if (node->tn_mode & (S_ISUID | S_ISGID)) { if (priv_check_cred(v->a_cred, PRIV_VFS_RETAINSUGID, 0)) node->tn_mode &= ~(S_ISUID | S_ISGID); } if (error != 0) (void)tmpfs_reg_resize(vp, oldsize); out: MPASS(IMPLIES(error == 0, uio->uio_resid == 0)); MPASS(IMPLIES(error != 0, oldsize == node->tn_size)); return error; } /* --------------------------------------------------------------------- */ static int tmpfs_fsync(struct vop_fsync_args *v) { struct vnode *vp = v->a_vp; MPASS(VOP_ISLOCKED(vp, v->a_td)); tmpfs_update(vp); return 0; } /* --------------------------------------------------------------------- */ static int tmpfs_remove(struct vop_remove_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp, v->a_cnp->cn_thread)); MPASS(VOP_ISLOCKED(vp, v->a_cnp->cn_thread)); if (vp->v_type == VDIR) { error = EISDIR; goto out; } dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); de = tmpfs_dir_search(dnode, node); MPASS(de != NULL); /* Files marked as immutable or append-only cannot be deleted. */ if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) || (dnode->tn_flags & APPEND)) { error = EPERM; goto out; } /* Remove the entry from the directory; as it is a file, we do not * have to change the number of hard links of the directory. */ tmpfs_dir_detach(dvp, de); /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de, TRUE); if (node->tn_links > 0) node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; error = 0; out: return error; } /* --------------------------------------------------------------------- */ static int tmpfs_link(struct vop_link_args *v) { struct vnode *dvp = v->a_tdvp; struct vnode *vp = v->a_vp; struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp, cnp->cn_thread)); MPASS(cnp->cn_flags & HASBUF); MPASS(dvp != vp); /* XXX When can this be false? */ node = VP_TO_TMPFS_NODE(vp); /* XXX: Why aren't the following two tests done by the caller? */ /* Hard links of directories are forbidden. */ if (vp->v_type == VDIR) { error = EPERM; goto out; } /* Cannot create cross-device links. */ if (dvp->v_mount != vp->v_mount) { error = EXDEV; goto out; } /* Ensure that we do not overflow the maximum number of links imposed * by the system. */ MPASS(node->tn_links <= LINK_MAX); if (node->tn_links == LINK_MAX) { error = EMLINK; goto out; } /* We cannot create links of files marked immutable or append-only. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* Allocate a new directory entry to represent the node. */ error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) goto out; /* Insert the new directory entry into the appropriate directory. */ tmpfs_dir_attach(dvp, de); /* vp link count has changed, so update node times. */ node->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(vp); error = 0; out: return error; } /* --------------------------------------------------------------------- */ static int tmpfs_rename(struct vop_rename_args *v) { struct vnode *fdvp = v->a_fdvp; struct vnode *fvp = v->a_fvp; struct componentname *fcnp = v->a_fcnp; struct vnode *tdvp = v->a_tdvp; struct vnode *tvp = v->a_tvp; struct componentname *tcnp = v->a_tcnp; char *newname; int error; struct tmpfs_dirent *de; struct tmpfs_node *fdnode; struct tmpfs_node *fnode; struct tmpfs_node *tnode; struct tmpfs_node *tdnode; MPASS(VOP_ISLOCKED(tdvp, tcnp->cn_thread)); MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp, tcnp->cn_thread))); MPASS(fcnp->cn_flags & HASBUF); MPASS(tcnp->cn_flags & HASBUF); tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp); /* Disallow cross-device renames. * XXX Why isn't this done by the caller? */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULL && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto out; } tdnode = VP_TO_TMPFS_DIR(tdvp); /* If source and target are the same file, there is nothing to do. */ if (fvp == tvp) { error = 0; goto out; } /* If we need to move the directory between entries, lock the * source so that we can safely operate on it. */ if (tdvp != fdvp) { - error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, tcnp->cn_thread); + error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto out; } fdnode = VP_TO_TMPFS_DIR(fdvp); fnode = VP_TO_TMPFS_NODE(fvp); de = tmpfs_dir_search(fdnode, fnode); /* Avoid manipulating '.' and '..' entries. */ if (de == NULL) { MPASS(fvp->v_type == VDIR); error = EINVAL; goto out_locked; } MPASS(de->td_node == fnode); /* If re-naming a directory to another preexisting directory * ensure that the target directory is empty so that its * removal causes no side effects. * Kern_rename gurantees the destination to be a directory * if the source is one. */ if (tvp != NULL) { MPASS(tnode != NULL); if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (tdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) { if (tnode->tn_size > 0) { error = ENOTEMPTY; goto out_locked; } } else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) { error = ENOTDIR; goto out_locked; } else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) { error = EISDIR; goto out_locked; } else { MPASS(fnode->tn_type != VDIR && tnode->tn_type != VDIR); } } if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } /* Ensure that we have enough memory to hold the new name, if it * has to be changed. */ if (fcnp->cn_namelen != tcnp->cn_namelen || memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0) { newname = malloc(tcnp->cn_namelen, M_TMPFSNAME, M_WAITOK); } else newname = NULL; /* If the node is being moved to another directory, we have to do * the move. */ if (fdnode != tdnode) { /* In case we are moving a directory, we have to adjust its * parent to point to the new parent. */ if (de->td_node->tn_type == VDIR) { struct tmpfs_node *n; /* Ensure the target directory is not a child of the * directory being moved. Otherwise, we'd end up * with stale nodes. */ n = tdnode; while (n != n->tn_dir.tn_parent) { if (n == fnode) { error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } n = n->tn_dir.tn_parent; } /* Adjust the parent pointer. */ TMPFS_VALIDATE_DIR(fnode); de->td_node->tn_dir.tn_parent = tdnode; /* As a result of changing the target of the '..' * entry, the link count of the source and target * directories has to be adjusted. */ fdnode->tn_links--; tdnode->tn_links++; } /* Do the move: just remove the entry from the source directory * and insert it into the target one. */ tmpfs_dir_detach(fdvp, de); tmpfs_dir_attach(tdvp, de); } /* If the name has changed, we need to make it effective by changing * it in the directory entry. */ if (newname != NULL) { MPASS(tcnp->cn_namelen <= MAXNAMLEN); free(de->td_name, M_TMPFSNAME); de->td_namelen = (uint16_t)tcnp->cn_namelen; memcpy(newname, tcnp->cn_nameptr, tcnp->cn_namelen); de->td_name = newname; fnode->tn_status |= TMPFS_NODE_CHANGED; tdnode->tn_status |= TMPFS_NODE_MODIFIED; } /* If we are overwriting an entry, we have to remove the old one * from the target directory. */ if (tvp != NULL) { /* Remove the old entry from the target directory. */ de = tmpfs_dir_search(tdnode, tnode); tmpfs_dir_detach(tdvp, de); /* Free the directory entry we just deleted. Note that the * node referred by it will not be removed until the vnode is * really reclaimed. */ tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), de, TRUE); } error = 0; out_locked: if (fdnode != tdnode) VOP_UNLOCK(fdvp, 0, tcnp->cn_thread); out: /* Release target nodes. */ /* XXX: I don't understand when tdvp can be the same as tvp, but * other code takes care of this... */ if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp != NULL) vput(tvp); /* Release source nodes. */ vrele(fdvp); vrele(fvp); return error; } /* --------------------------------------------------------------------- */ static int tmpfs_mkdir(struct vop_mkdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; MPASS(vap->va_type == VDIR); return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); } /* --------------------------------------------------------------------- */ static int tmpfs_rmdir(struct vop_rmdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp, v->a_cnp->cn_thread)); MPASS(VOP_ISLOCKED(vp, v->a_cnp->cn_thread)); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_DIR(vp); /* Directories with more than two entries ('.' and '..') cannot be * removed. */ if (node->tn_size > 0) { error = ENOTEMPTY; goto out; } if ((dnode->tn_flags & APPEND) || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* This invariant holds only if we are not trying to remove "..". * We checked for that above so this is safe now. */ MPASS(node->tn_dir.tn_parent == dnode); /* Get the directory entry associated with node (vp). This was * filled by tmpfs_lookup while looking up the entry. */ de = tmpfs_dir_search(dnode, node); MPASS(TMPFS_DIRENT_MATCHES(de, v->a_cnp->cn_nameptr, v->a_cnp->cn_namelen)); /* Check flags to see if we are allowed to remove the directory. */ if (dnode->tn_flags & APPEND || node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* Detach the directory entry from the directory (dnode). */ tmpfs_dir_detach(dvp, de); node->tn_links--; node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; node->tn_dir.tn_parent->tn_links--; node->tn_dir.tn_parent->tn_status |= TMPFS_NODE_ACCESSED | \ TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; cache_purge(dvp); cache_purge(vp); /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de, TRUE); /* Release the deleted vnode (will destroy the node, notify * interested parties and clean it from the cache). */ dnode->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(dvp); error = 0; out: return error; } /* --------------------------------------------------------------------- */ static int tmpfs_symlink(struct vop_symlink_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; char *target = v->a_target; #ifdef notyet /* XXX FreeBSD BUG: kern_symlink is not setting VLNK */ MPASS(vap->va_type == VLNK); #else vap->va_type = VLNK; #endif return tmpfs_alloc_file(dvp, vpp, vap, cnp, target); } /* --------------------------------------------------------------------- */ static int tmpfs_readdir(struct vop_readdir_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; int *eofflag = v->a_eofflag; u_long **cookies = v->a_cookies; int *ncookies = v->a_ncookies; int error; off_t startoff; off_t cnt = 0; struct tmpfs_node *node; /* This operation only makes sense on directory nodes. */ if (vp->v_type != VDIR) return ENOTDIR; node = VP_TO_TMPFS_DIR(vp); startoff = uio->uio_offset; if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) { error = tmpfs_dir_getdotdent(node, uio); if (error != 0) goto outok; cnt++; } if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) { error = tmpfs_dir_getdotdotdent(node, uio); if (error != 0) goto outok; cnt++; } error = tmpfs_dir_getdents(node, uio, &cnt); outok: MPASS(error >= -1); if (error == -1) error = 0; if (eofflag != NULL) *eofflag = (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF); /* Update NFS-related variables. */ if (error == 0 && cookies != NULL && ncookies != NULL) { off_t i; off_t off = startoff; struct tmpfs_dirent *de = NULL; *ncookies = cnt; *cookies = malloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK); for (i = 0; i < cnt; i++) { MPASS(off != TMPFS_DIRCOOKIE_EOF); if (off == TMPFS_DIRCOOKIE_DOT) { off = TMPFS_DIRCOOKIE_DOTDOT; } else { if (off == TMPFS_DIRCOOKIE_DOTDOT) { de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); } else if (de != NULL) { de = TAILQ_NEXT(de, td_entries); } else { de = tmpfs_dir_lookupbycookie(node, off); MPASS(de != NULL); de = TAILQ_NEXT(de, td_entries); } if (de == NULL) off = TMPFS_DIRCOOKIE_EOF; else off = tmpfs_dircookie(de); } (*cookies)[i] = off; } MPASS(uio->uio_offset == off); } return error; } /* --------------------------------------------------------------------- */ static int tmpfs_readlink(struct vop_readlink_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; int error; struct tmpfs_node *node; MPASS(uio->uio_offset == 0); MPASS(vp->v_type == VLNK); node = VP_TO_TMPFS_NODE(vp); error = uiomove(node->tn_link, MIN(node->tn_size, uio->uio_resid), uio); node->tn_status |= TMPFS_NODE_ACCESSED; return error; } /* --------------------------------------------------------------------- */ static int tmpfs_inactive(struct vop_inactive_args *v) { struct vnode *vp = v->a_vp; struct thread *l = v->a_td; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp, l)); node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) vrecycle(vp, l); return 0; } /* --------------------------------------------------------------------- */ int tmpfs_reclaim(struct vop_reclaim_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_mount *tmp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); vnode_destroy_vobject(vp); cache_purge(vp); tmpfs_free_vp(vp); /* If the node referenced by this vnode was deleted by the user, * we must free its associated data structures (now that the vnode * is being reclaimed). */ if (node->tn_links == 0) tmpfs_free_node(tmp, node); MPASS(vp->v_data == NULL); return 0; } /* --------------------------------------------------------------------- */ static int tmpfs_print(struct vop_print_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n", node, node->tn_flags, node->tn_links); printf("\tmode 0%o, owner %d, group %d, size %" PRIdMAX ", status 0x%x\n", node->tn_mode, node->tn_uid, node->tn_gid, (uintmax_t)node->tn_size, node->tn_status); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return 0; } /* --------------------------------------------------------------------- */ static int tmpfs_pathconf(struct vop_pathconf_args *v) { int name = v->a_name; register_t *retval = v->a_retval; int error; error = 0; switch (name) { case _PC_LINK_MAX: *retval = LINK_MAX; break; case _PC_NAME_MAX: *retval = NAME_MAX; break; case _PC_PATH_MAX: *retval = PATH_MAX; break; case _PC_PIPE_BUF: *retval = PIPE_BUF; break; case _PC_CHOWN_RESTRICTED: *retval = 1; break; case _PC_NO_TRUNC: *retval = 1; break; case _PC_SYNC_IO: *retval = 1; break; case _PC_FILESIZEBITS: *retval = 0; /* XXX Don't know which value should I return. */ break; default: error = EINVAL; } return error; } /* --------------------------------------------------------------------- */ static int tmpfs_advlock(struct vop_advlock_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); return lf_advlock(v, &node->tn_lockf, node->tn_size); } /* --------------------------------------------------------------------- */ static int tmpfs_vptofh(struct vop_vptofh_args *ap) { struct tmpfs_fid *tfhp; struct tmpfs_node *node; tfhp = (struct tmpfs_fid *)ap->a_fhp; node = VP_TO_TMPFS_NODE(ap->a_vp); tfhp->tf_len = sizeof(struct tmpfs_fid); tfhp->tf_id = node->tn_id; tfhp->tf_gen = node->tn_gen; return (0); } /* --------------------------------------------------------------------- */ /* * vnode operations vector used for files stored in a tmpfs file system. */ struct vop_vector tmpfs_vnodeop_entries = { .vop_default = &default_vnodeops, .vop_lookup = vfs_cache_lookup, .vop_cachedlookup = tmpfs_lookup, .vop_create = tmpfs_create, .vop_mknod = tmpfs_mknod, .vop_open = tmpfs_open, .vop_close = tmpfs_close, .vop_access = tmpfs_access, .vop_getattr = tmpfs_getattr, .vop_setattr = tmpfs_setattr, .vop_read = tmpfs_read, .vop_write = tmpfs_write, .vop_fsync = tmpfs_fsync, .vop_remove = tmpfs_remove, .vop_link = tmpfs_link, .vop_rename = tmpfs_rename, .vop_mkdir = tmpfs_mkdir, .vop_rmdir = tmpfs_rmdir, .vop_symlink = tmpfs_symlink, .vop_readdir = tmpfs_readdir, .vop_readlink = tmpfs_readlink, .vop_inactive = tmpfs_inactive, .vop_reclaim = tmpfs_reclaim, .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, .vop_advlock = tmpfs_advlock, .vop_vptofh = tmpfs_vptofh, .vop_bmap = VOP_EOPNOTSUPP, }; Index: head/sys/fs/udf/udf_vnops.c =================================================================== --- head/sys/fs/udf/udf_vnops.c (revision 175201) +++ head/sys/fs/udf/udf_vnops.c (revision 175202) @@ -1,1218 +1,1218 @@ /*- * Copyright (c) 2001, 2002 Scott Long * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* udf_vnops.c */ /* Take care of the vnode side of things */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct iconv_functions *udf_iconv; static vop_access_t udf_access; static vop_getattr_t udf_getattr; static vop_open_t udf_open; static vop_ioctl_t udf_ioctl; static vop_pathconf_t udf_pathconf; static vop_read_t udf_read; static vop_readdir_t udf_readdir; static vop_readlink_t udf_readlink; static vop_strategy_t udf_strategy; static vop_bmap_t udf_bmap; static vop_cachedlookup_t udf_lookup; static vop_reclaim_t udf_reclaim; static vop_vptofh_t udf_vptofh; static int udf_readatoffset(struct udf_node *node, int *size, off_t offset, struct buf **bp, uint8_t **data); static int udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector, uint32_t *max_size); static struct vop_vector udf_vnodeops = { .vop_default = &default_vnodeops, .vop_access = udf_access, .vop_bmap = udf_bmap, .vop_cachedlookup = udf_lookup, .vop_getattr = udf_getattr, .vop_ioctl = udf_ioctl, .vop_lookup = vfs_cache_lookup, .vop_open = udf_open, .vop_pathconf = udf_pathconf, .vop_read = udf_read, .vop_readdir = udf_readdir, .vop_readlink = udf_readlink, .vop_reclaim = udf_reclaim, .vop_strategy = udf_strategy, .vop_vptofh = udf_vptofh, }; MALLOC_DEFINE(M_UDFFID, "udf_fid", "UDF FileId structure"); MALLOC_DEFINE(M_UDFDS, "udf_ds", "UDF Dirstream structure"); #define UDF_INVALID_BMAP -1 int udf_allocv(struct mount *mp, struct vnode **vpp, struct thread *td) { int error; struct vnode *vp; error = getnewvnode("udf", mp, &udf_vnodeops, &vp); if (error) { printf("udf_allocv: failed to allocate new vnode\n"); return (error); } *vpp = vp; return (0); } /* Convert file entry permission (5 bits per owner/group/user) to a mode_t */ static mode_t udf_permtomode(struct udf_node *node) { uint32_t perm; uint16_t flags; mode_t mode; perm = le32toh(node->fentry->perm); flags = le16toh(node->fentry->icbtag.flags); mode = perm & UDF_FENTRY_PERM_USER_MASK; mode |= ((perm & UDF_FENTRY_PERM_GRP_MASK) >> 2); mode |= ((perm & UDF_FENTRY_PERM_OWNER_MASK) >> 4); mode |= ((flags & UDF_ICB_TAG_FLAGS_STICKY) << 4); mode |= ((flags & UDF_ICB_TAG_FLAGS_SETGID) << 6); mode |= ((flags & UDF_ICB_TAG_FLAGS_SETUID) << 8); return (mode); } static int udf_access(struct vop_access_args *a) { struct vnode *vp; struct udf_node *node; mode_t a_mode, mode; vp = a->a_vp; node = VTON(vp); a_mode = a->a_mode; if (a_mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } mode = udf_permtomode(node); return (vaccess(vp->v_type, mode, node->fentry->uid, node->fentry->gid, a_mode, a->a_cred, NULL)); } static int udf_open(struct vop_open_args *ap) { struct udf_node *np = VTON(ap->a_vp); off_t fsize; fsize = le64toh(np->fentry->inf_len); vnode_create_vobject(ap->a_vp, fsize, ap->a_td); return 0; } static int mon_lens[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31} }; static int udf_isaleapyear(int year) { int i; i = (year % 4) ? 0 : 1; i &= (year % 100) ? 1 : 0; i |= (year % 400) ? 0 : 1; return i; } /* * XXX This is just a rough hack. Daylight savings isn't calculated and tv_nsec * is ignored. * Timezone calculation compliments of Julian Elischer . */ static void udf_timetotimespec(struct timestamp *time, struct timespec *t) { int i, lpyear, daysinyear, year; union { uint16_t u_tz_offset; int16_t s_tz_offset; } tz; t->tv_nsec = 0; /* DirectCD seems to like using bogus year values */ year = le16toh(time->year); if (year < 1970) { t->tv_sec = 0; return; } /* Calculate the time and day */ t->tv_sec = time->second; t->tv_sec += time->minute * 60; t->tv_sec += time->hour * 3600; t->tv_sec += time->day * 3600 * 24; /* Calculate the month */ lpyear = udf_isaleapyear(year); for (i = 1; i < time->month; i++) t->tv_sec += mon_lens[lpyear][i] * 3600 * 24; /* Speed up the calculation */ if (year > 1979) t->tv_sec += 315532800; if (year > 1989) t->tv_sec += 315619200; if (year > 1999) t->tv_sec += 315532800; for (i = 2000; i < year; i++) { daysinyear = udf_isaleapyear(i) + 365 ; t->tv_sec += daysinyear * 3600 * 24; } /* * Calculate the time zone. The timezone is 12 bit signed 2's * complement, so we gotta do some extra magic to handle it right. */ tz.u_tz_offset = le16toh(time->type_tz); tz.u_tz_offset &= 0x0fff; if (tz.u_tz_offset & 0x0800) tz.u_tz_offset |= 0xf000; /* extend the sign to 16 bits */ if ((time->type_tz & 0x1000) && (tz.s_tz_offset != -2047)) t->tv_sec -= tz.s_tz_offset * 60; return; } static int udf_getattr(struct vop_getattr_args *a) { struct vnode *vp; struct udf_node *node; struct vattr *vap; struct file_entry *fentry; struct timespec ts; ts.tv_sec = 0; vp = a->a_vp; vap = a->a_vap; node = VTON(vp); fentry = node->fentry; vap->va_fsid = dev2udev(node->udfmp->im_dev); vap->va_fileid = node->hash_id; vap->va_mode = udf_permtomode(node); vap->va_nlink = le16toh(fentry->link_cnt); /* * XXX The spec says that -1 is valid for uid/gid and indicates an * invalid uid/gid. How should this be represented? */ vap->va_uid = (le32toh(fentry->uid) == -1) ? 0 : le32toh(fentry->uid); vap->va_gid = (le32toh(fentry->gid) == -1) ? 0 : le32toh(fentry->gid); udf_timetotimespec(&fentry->atime, &vap->va_atime); udf_timetotimespec(&fentry->mtime, &vap->va_mtime); vap->va_ctime = vap->va_mtime; /* XXX Stored as an Extended Attribute */ vap->va_rdev = 0; /* XXX */ if (vp->v_type & VDIR) { /* * Directories that are recorded within their ICB will show * as having 0 blocks recorded. Since tradition dictates * that directories consume at least one logical block, * make it appear so. */ if (fentry->logblks_rec != 0) { vap->va_size = le64toh(fentry->logblks_rec) * node->udfmp->bsize; } else { vap->va_size = node->udfmp->bsize; } } else { vap->va_size = le64toh(fentry->inf_len); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = node->udfmp->bsize; vap->va_bytes = le64toh(fentry->inf_len); vap->va_type = vp->v_type; vap->va_filerev = 0; /* XXX */ return (0); } /* * File specific ioctls. */ static int udf_ioctl(struct vop_ioctl_args *a) { printf("%s called\n", __func__); return (ENOTTY); } /* * I'm not sure that this has much value in a read-only filesystem, but * cd9660 has it too. */ static int udf_pathconf(struct vop_pathconf_args *a) { switch (a->a_name) { case _PC_LINK_MAX: *a->a_retval = 65535; return (0); case _PC_NAME_MAX: *a->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *a->a_retval = PATH_MAX; return (0); case _PC_NO_TRUNC: *a->a_retval = 1; return (0); default: return (EINVAL); } } #define lblkno(udfmp, loc) ((loc) >> (udfmp)->bshift) #define blkoff(udfmp, loc) ((loc) & (udfmp)->bmask) #define lblktosize(imp, blk) ((blk) << (udfmp)->bshift) static int udf_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct udf_node *node = VTON(vp); struct udf_mnt *udfmp; struct buf *bp; daddr_t lbn, rablock; off_t diff, fsize; int error = 0; long size, n, on; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); fsize = le64toh(node->fentry->inf_len); udfmp = node->udfmp; do { lbn = lblkno(udfmp, uio->uio_offset); on = blkoff(udfmp, uio->uio_offset); n = min((u_int)(udfmp->bsize - on), uio->uio_resid); diff = fsize - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = udfmp->bsize; rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(udfmp, rablock) < fsize) { error = cluster_read(vp, fsize, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); } else { error = bread(vp, lbn, size, NOCRED, &bp); } } else { error = bread(vp, lbn, size, NOCRED, &bp); } n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Call the OSTA routines to translate the name from a CS0 dstring to a * 16-bit Unicode String. Hooks need to be placed in here to translate from * Unicode to the encoding that the kernel/user expects. Return the length * of the translated string. */ static int udf_transname(char *cs0string, char *destname, int len, struct udf_mnt *udfmp) { unicode_t *transname; char *unibuf, *unip; int i, destlen; ssize_t unilen = 0; size_t destleft = MAXNAMLEN; /* Convert 16-bit Unicode to destname */ if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) { /* allocate a buffer big enough to hold an 8->16 bit expansion */ unibuf = uma_zalloc(udf_zone_trans, M_WAITOK); unip = unibuf; if ((unilen = (ssize_t)udf_UncompressUnicodeByte(len, cs0string, unibuf)) == -1) { printf("udf: Unicode translation failed\n"); uma_zfree(udf_zone_trans, unibuf); return 0; } while (unilen > 0 && destleft > 0) { udf_iconv->conv(udfmp->im_d2l, (const char **)&unibuf, (size_t *)&unilen, (char **)&destname, &destleft); /* Unconverted character found */ if (unilen > 0 && destleft > 0) { *destname++ = '?'; destleft--; unibuf += 2; unilen -= 2; } } uma_zfree(udf_zone_trans, unip); *destname = '\0'; destlen = MAXNAMLEN - (int)destleft; } else { /* allocate a buffer big enough to hold an 8->16 bit expansion */ transname = uma_zalloc(udf_zone_trans, M_WAITOK); if ((unilen = (ssize_t)udf_UncompressUnicode(len, cs0string, transname)) == -1) { printf("udf: Unicode translation failed\n"); uma_zfree(udf_zone_trans, transname); return 0; } for (i = 0; i < unilen ; i++) { if (transname[i] & 0xff00) { destname[i] = '.'; /* Fudge the 16bit chars */ } else { destname[i] = transname[i] & 0xff; } } uma_zfree(udf_zone_trans, transname); destname[unilen] = 0; destlen = (int)unilen; } return (destlen); } /* * Compare a CS0 dstring with a name passed in from the VFS layer. Return * 0 on a successful match, nonzero otherwise. Unicode work may need to be done * here also. */ static int udf_cmpname(char *cs0string, char *cmpname, int cs0len, int cmplen, struct udf_mnt *udfmp) { char *transname; int error = 0; /* This is overkill, but not worth creating a new zone */ transname = uma_zalloc(udf_zone_trans, M_WAITOK); cs0len = udf_transname(cs0string, transname, cs0len, udfmp); /* Easy check. If they aren't the same length, they aren't equal */ if ((cs0len == 0) || (cs0len != cmplen)) error = -1; else error = bcmp(transname, cmpname, cmplen); uma_zfree(udf_zone_trans, transname); return (error); } struct udf_uiodir { struct dirent *dirent; u_long *cookies; int ncookies; int acookies; int eofflag; }; static int udf_uiodir(struct udf_uiodir *uiodir, int de_size, struct uio *uio, long cookie) { if (uiodir->cookies != NULL) { if (++uiodir->acookies > uiodir->ncookies) { uiodir->eofflag = 0; return (-1); } *uiodir->cookies++ = cookie; } if (uio->uio_resid < de_size) { uiodir->eofflag = 0; return (-1); } return (uiomove(uiodir->dirent, de_size, uio)); } static struct udf_dirstream * udf_opendir(struct udf_node *node, int offset, int fsize, struct udf_mnt *udfmp) { struct udf_dirstream *ds; ds = uma_zalloc(udf_zone_ds, M_WAITOK | M_ZERO); ds->node = node; ds->offset = offset; ds->udfmp = udfmp; ds->fsize = fsize; return (ds); } static struct fileid_desc * udf_getfid(struct udf_dirstream *ds) { struct fileid_desc *fid; int error, frag_size = 0, total_fid_size; /* End of directory? */ if (ds->offset + ds->off >= ds->fsize) { ds->error = 0; return (NULL); } /* Grab the first extent of the directory */ if (ds->off == 0) { ds->size = 0; error = udf_readatoffset(ds->node, &ds->size, ds->offset, &ds->bp, &ds->data); if (error) { ds->error = error; if (ds->bp != NULL) brelse(ds->bp); return (NULL); } } /* * Clean up from a previous fragmented FID. * XXX Is this the right place for this? */ if (ds->fid_fragment && ds->buf != NULL) { ds->fid_fragment = 0; FREE(ds->buf, M_UDFFID); } fid = (struct fileid_desc*)&ds->data[ds->off]; /* * Check to see if the fid is fragmented. The first test * ensures that we don't wander off the end of the buffer * looking for the l_iu and l_fi fields. */ if (ds->off + UDF_FID_SIZE > ds->size || ds->off + le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE > ds->size){ /* Copy what we have of the fid into a buffer */ frag_size = ds->size - ds->off; if (frag_size >= ds->udfmp->bsize) { printf("udf: invalid FID fragment\n"); ds->error = EINVAL; return (NULL); } /* * File ID descriptors can only be at most one * logical sector in size. */ MALLOC(ds->buf, uint8_t*, ds->udfmp->bsize, M_UDFFID, M_WAITOK | M_ZERO); bcopy(fid, ds->buf, frag_size); /* Reduce all of the casting magic */ fid = (struct fileid_desc*)ds->buf; if (ds->bp != NULL) brelse(ds->bp); /* Fetch the next allocation */ ds->offset += ds->size; ds->size = 0; error = udf_readatoffset(ds->node, &ds->size, ds->offset, &ds->bp, &ds->data); if (error) { ds->error = error; return (NULL); } /* * If the fragment was so small that we didn't get * the l_iu and l_fi fields, copy those in. */ if (frag_size < UDF_FID_SIZE) bcopy(ds->data, &ds->buf[frag_size], UDF_FID_SIZE - frag_size); /* * Now that we have enough of the fid to work with, * copy in the rest of the fid from the new * allocation. */ total_fid_size = UDF_FID_SIZE + le16toh(fid->l_iu) + fid->l_fi; if (total_fid_size > ds->udfmp->bsize) { printf("udf: invalid FID\n"); ds->error = EIO; return (NULL); } bcopy(ds->data, &ds->buf[frag_size], total_fid_size - frag_size); ds->fid_fragment = 1; } else { total_fid_size = le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE; } /* * Update the offset. Align on a 4 byte boundary because the * UDF spec says so. */ ds->this_off = ds->off; if (!ds->fid_fragment) { ds->off += (total_fid_size + 3) & ~0x03; } else { ds->off = (total_fid_size - frag_size + 3) & ~0x03; } return (fid); } static void udf_closedir(struct udf_dirstream *ds) { if (ds->bp != NULL) brelse(ds->bp); if (ds->fid_fragment && ds->buf != NULL) FREE(ds->buf, M_UDFFID); uma_zfree(udf_zone_ds, ds); } static int udf_readdir(struct vop_readdir_args *a) { struct vnode *vp; struct uio *uio; struct dirent dir; struct udf_node *node; struct udf_mnt *udfmp; struct fileid_desc *fid; struct udf_uiodir uiodir; struct udf_dirstream *ds; u_long *cookies = NULL; int ncookies; int error = 0; vp = a->a_vp; uio = a->a_uio; node = VTON(vp); udfmp = node->udfmp; uiodir.eofflag = 1; if (a->a_ncookies != NULL) { /* * Guess how many entries are needed. If we run out, this * function will be called again and thing will pick up were * it left off. */ ncookies = uio->uio_resid / 8; MALLOC(cookies, u_long *, sizeof(u_long) * ncookies, M_TEMP, M_WAITOK); if (cookies == NULL) return (ENOMEM); uiodir.ncookies = ncookies; uiodir.cookies = cookies; uiodir.acookies = 0; } else { uiodir.cookies = NULL; } /* * Iterate through the file id descriptors. Give the parent dir * entry special attention. */ ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len), node->udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? */ if (udf_checktag(&fid->tag, TAGID_FID)) { printf("Invalid FID tag\n"); hexdump(fid, UDF_FID_SIZE, NULL, 0); error = EIO; break; } /* Is this a deleted file? */ if (fid->file_char & UDF_FILE_CHAR_DEL) continue; if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) { /* Do up the '.' and '..' entries. Dummy values are * used for the cookies since the offset here is * usually zero, and NFS doesn't like that value */ dir.d_fileno = node->hash_id; dir.d_type = DT_DIR; dir.d_name[0] = '.'; dir.d_name[1] = '\0'; dir.d_namlen = 1; dir.d_reclen = GENERIC_DIRSIZ(&dir); uiodir.dirent = &dir; error = udf_uiodir(&uiodir, dir.d_reclen, uio, 1); if (error) break; dir.d_fileno = udf_getid(&fid->icb); dir.d_type = DT_DIR; dir.d_name[0] = '.'; dir.d_name[1] = '.'; dir.d_name[2] = '\0'; dir.d_namlen = 2; dir.d_reclen = GENERIC_DIRSIZ(&dir); uiodir.dirent = &dir; error = udf_uiodir(&uiodir, dir.d_reclen, uio, 2); } else { dir.d_namlen = udf_transname(&fid->data[fid->l_iu], &dir.d_name[0], fid->l_fi, udfmp); dir.d_fileno = udf_getid(&fid->icb); dir.d_type = (fid->file_char & UDF_FILE_CHAR_DIR) ? DT_DIR : DT_UNKNOWN; dir.d_reclen = GENERIC_DIRSIZ(&dir); uiodir.dirent = &dir; error = udf_uiodir(&uiodir, dir.d_reclen, uio, ds->this_off); } if (error) { printf("uiomove returned %d\n", error); break; } } /* tell the calling layer whether we need to be called again */ *a->a_eofflag = uiodir.eofflag; uio->uio_offset = ds->offset + ds->off; if (!error) error = ds->error; udf_closedir(ds); if (a->a_ncookies != NULL) { if (error) FREE(cookies, M_TEMP); else { *a->a_ncookies = uiodir.acookies; *a->a_cookies = cookies; } } return (error); } /* Are there any implementations out there that do soft-links? */ static int udf_readlink(struct vop_readlink_args *ap) { printf("%s called\n", __func__); return (EOPNOTSUPP); } static int udf_strategy(struct vop_strategy_args *a) { struct buf *bp; struct vnode *vp; struct udf_node *node; int maxsize; daddr_t sector; struct bufobj *bo; int multiplier; bp = a->a_bp; vp = a->a_vp; node = VTON(vp); if (bp->b_blkno == bp->b_lblkno) { /* * Files that are embedded in the fentry don't translate well * to a block number. Reject. */ if (udf_bmap_internal(node, bp->b_lblkno * node->udfmp->bsize, §or, &maxsize)) { clrbuf(bp); bp->b_blkno = -1; } /* bmap gives sector numbers, bio works with device blocks */ multiplier = node->udfmp->bsize / DEV_BSIZE; bp->b_blkno = sector * multiplier; } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bo = node->udfmp->im_bo; bp->b_iooffset = dbtob(bp->b_blkno); BO_STRATEGY(bo, bp); return (0); } static int udf_bmap(struct vop_bmap_args *a) { struct udf_node *node; uint32_t max_size; daddr_t lsector; int error; node = VTON(a->a_vp); if (a->a_bop != NULL) *a->a_bop = &node->udfmp->im_devvp->v_bufobj; if (a->a_bnp == NULL) return (0); if (a->a_runb) *a->a_runb = 0; error = udf_bmap_internal(node, a->a_bn * node->udfmp->bsize, &lsector, &max_size); if (error) return (error); /* Translate logical to physical sector number */ *a->a_bnp = lsector << (node->udfmp->bshift - DEV_BSHIFT); /* Punt on read-ahead for now */ if (a->a_runp) *a->a_runp = 0; return (0); } /* * The all powerful VOP_LOOKUP(). */ static int udf_lookup(struct vop_cachedlookup_args *a) { struct vnode *dvp; struct vnode *tdp = NULL; struct vnode **vpp = a->a_vpp; struct udf_node *node; struct udf_mnt *udfmp; struct fileid_desc *fid = NULL; struct udf_dirstream *ds; struct thread *td; u_long nameiop; u_long flags; char *nameptr; long namelen; ino_t id = 0; int offset, error = 0; int numdirpasses, fsize; dvp = a->a_dvp; node = VTON(dvp); udfmp = node->udfmp; nameiop = a->a_cnp->cn_nameiop; flags = a->a_cnp->cn_flags; nameptr = a->a_cnp->cn_nameptr; namelen = a->a_cnp->cn_namelen; fsize = le64toh(node->fentry->inf_len); td = a->a_cnp->cn_thread; /* * If this is a LOOKUP and we've already partially searched through * the directory, pick up where we left off and flag that the * directory may need to be searched twice. For a full description, * see /sys/fs/cd9660/cd9660_lookup.c:cd9660_lookup() */ if (nameiop != LOOKUP || node->diroff == 0 || node->diroff > fsize) { offset = 0; numdirpasses = 1; } else { offset = node->diroff; numdirpasses = 2; nchstats.ncs_2passes++; } lookloop: ds = udf_opendir(node, offset, fsize, udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? */ if (udf_checktag(&fid->tag, TAGID_FID)) { printf("udf_lookup: Invalid tag\n"); error = EIO; break; } /* Is this a deleted file? */ if (fid->file_char & UDF_FILE_CHAR_DEL) continue; if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) { if (flags & ISDOTDOT) { id = udf_getid(&fid->icb); break; } } else { if (!(udf_cmpname(&fid->data[fid->l_iu], nameptr, fid->l_fi, namelen, udfmp))) { id = udf_getid(&fid->icb); break; } } } if (!error) error = ds->error; /* XXX Bail out here? */ if (error) { udf_closedir(ds); return (error); } /* Did we have a match? */ if (id) { if (flags & ISDOTDOT) VOP_UNLOCK(dvp, 0, a->a_cnp->cn_thread); error = udf_vget(udfmp->im_mountp, id, LK_EXCLUSIVE, &tdp); if (flags & ISDOTDOT) - vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY, a->a_cnp->cn_thread); + vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY); if (!error) { /* * Remember where this entry was if it's the final * component. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) node->diroff = ds->offset + ds->off; if (numdirpasses == 2) nchstats.ncs_pass2++; *vpp = tdp; /* Put this entry in the cache */ if (flags & MAKEENTRY) cache_enter(dvp, *vpp, a->a_cnp); } } else { /* Name wasn't found on this pass. Do another pass? */ if (numdirpasses == 2) { numdirpasses--; offset = 0; udf_closedir(ds); goto lookloop; } /* Enter name into cache as non-existant */ if (flags & MAKEENTRY) cache_enter(dvp, *vpp, a->a_cnp); if ((flags & ISLASTCN) && (nameiop == CREATE || nameiop == RENAME)) { error = EROFS; } else { error = ENOENT; } } udf_closedir(ds); return (error); } static int udf_reclaim(struct vop_reclaim_args *a) { struct vnode *vp; struct udf_node *unode; vp = a->a_vp; unode = VTON(vp); /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); if (unode != NULL) { vfs_hash_remove(vp); if (unode->fentry != NULL) FREE(unode->fentry, M_UDFFENTRY); uma_zfree(udf_zone_node, unode); vp->v_data = NULL; } return (0); } static int udf_vptofh(struct vop_vptofh_args *a) { struct udf_node *node; struct ifid *ifhp; node = VTON(a->a_vp); ifhp = (struct ifid *)a->a_fhp; ifhp->ifid_len = sizeof(struct ifid); ifhp->ifid_ino = node->hash_id; return (0); } /* * Read the block and then set the data pointer to correspond with the * offset passed in. Only read in at most 'size' bytes, and then set 'size' * to the number of bytes pointed to. If 'size' is zero, try to read in a * whole extent. * * Note that *bp may be assigned error or not. * */ static int udf_readatoffset(struct udf_node *node, int *size, off_t offset, struct buf **bp, uint8_t **data) { struct udf_mnt *udfmp; struct file_entry *fentry = NULL; struct buf *bp1; uint32_t max_size; daddr_t sector; int error; udfmp = node->udfmp; *bp = NULL; error = udf_bmap_internal(node, offset, §or, &max_size); if (error == UDF_INVALID_BMAP) { /* * This error means that the file *data* is stored in the * allocation descriptor field of the file entry. */ fentry = node->fentry; *data = &fentry->data[le32toh(fentry->l_ea)]; *size = le32toh(fentry->l_ad); return (0); } else if (error != 0) { return (error); } /* Adjust the size so that it is within range */ if (*size == 0 || *size > max_size) *size = max_size; *size = min(*size, MAXBSIZE); if ((error = udf_readlblks(udfmp, sector, *size + (offset & udfmp->bmask), bp))) { printf("warning: udf_readlblks returned error %d\n", error); /* note: *bp may be non-NULL */ return (error); } bp1 = *bp; *data = (uint8_t *)&bp1->b_data[offset & udfmp->bmask]; return (0); } /* * Translate a file offset into a logical block and then into a physical * block. * max_size - maximum number of bytes that can be read starting from given * offset, rather than beginning of calculated sector number */ static int udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector, uint32_t *max_size) { struct udf_mnt *udfmp; struct file_entry *fentry; void *icb; struct icb_tag *tag; uint32_t icblen = 0; daddr_t lsector; int ad_offset, ad_num = 0; int i, p_offset; udfmp = node->udfmp; fentry = node->fentry; tag = &fentry->icbtag; switch (le16toh(tag->strat_type)) { case 4: break; case 4096: printf("Cannot deal with strategy4096 yet!\n"); return (ENODEV); default: printf("Unknown strategy type %d\n", tag->strat_type); return (ENODEV); } switch (le16toh(tag->flags) & 0x7) { case 0: /* * The allocation descriptor field is filled with short_ad's. * If the offset is beyond the current extent, look for the * next extent. */ do { offset -= icblen; ad_offset = sizeof(struct short_ad) * ad_num; if (ad_offset > le32toh(fentry->l_ad)) { printf("File offset out of bounds\n"); return (EINVAL); } icb = GETICB(short_ad, fentry, le32toh(fentry->l_ea) + ad_offset); icblen = GETICBLEN(short_ad, icb); ad_num++; } while(offset >= icblen); lsector = (offset >> udfmp->bshift) + le32toh(((struct short_ad *)(icb))->pos); *max_size = icblen - offset; break; case 1: /* * The allocation descriptor field is filled with long_ad's * If the offset is beyond the current extent, look for the * next extent. */ do { offset -= icblen; ad_offset = sizeof(struct long_ad) * ad_num; if (ad_offset > le32toh(fentry->l_ad)) { printf("File offset out of bounds\n"); return (EINVAL); } icb = GETICB(long_ad, fentry, le32toh(fentry->l_ea) + ad_offset); icblen = GETICBLEN(long_ad, icb); ad_num++; } while(offset >= icblen); lsector = (offset >> udfmp->bshift) + le32toh(((struct long_ad *)(icb))->loc.lb_num); *max_size = icblen - offset; break; case 3: /* * This type means that the file *data* is stored in the * allocation descriptor field of the file entry. */ *max_size = 0; *sector = node->hash_id + udfmp->part_start; return (UDF_INVALID_BMAP); case 2: /* DirectCD does not use extended_ad's */ default: printf("Unsupported allocation descriptor %d\n", tag->flags & 0x7); return (ENODEV); } *sector = lsector + udfmp->part_start; /* * Check the sparing table. Each entry represents the beginning of * a packet. */ if (udfmp->s_table != NULL) { for (i = 0; i< udfmp->s_table_entries; i++) { p_offset = lsector - le32toh(udfmp->s_table->entries[i].org); if ((p_offset < udfmp->p_sectors) && (p_offset >= 0)) { *sector = le32toh(udfmp->s_table->entries[i].map) + p_offset; break; } } } return (0); } Index: head/sys/fs/unionfs/union_subr.c =================================================================== --- head/sys/fs/unionfs/union_subr.c (revision 175201) +++ head/sys/fs/unionfs/union_subr.c (revision 175202) @@ -1,1093 +1,1093 @@ /*- * Copyright (c) 1994 Jan-Simon Pendry * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * Copyright (c) 2005, 2006 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006 Daichi Goto * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC #include #endif #include #include MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part"); MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part"); /* * Initialize */ int unionfs_init(struct vfsconf *vfsp) { UNIONFSDEBUG("unionfs_init\n"); /* printed during system boot */ return (0); } /* * Uninitialize */ int unionfs_uninit(struct vfsconf *vfsp) { return (0); } /* * Make a new or get existing unionfs node. * * uppervp and lowervp should be unlocked. Because if new unionfs vnode is * locked, uppervp or lowervp is locked too. In order to prevent dead lock, * you should not lock plurality simultaneously. */ int unionfs_nodeget(struct mount *mp, struct vnode *uppervp, struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct thread *td) { struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *vp; int error; int lkflags; char *path; ump = MOUNTTOUNIONFSMOUNT(mp); lkflags = (cnp ? cnp->cn_lkflags : 0); path = (cnp ? cnp->cn_nameptr : NULL); if (uppervp == NULLVP && lowervp == NULLVP) panic("unionfs_nodeget: upper and lower is null"); /* If it has no ISLASTCN flag, path check is skipped. */ if (cnp && !(cnp->cn_flags & ISLASTCN)) path = NULL; if ((uppervp == NULLVP || ump->um_uppervp != uppervp) || (lowervp == NULLVP || ump->um_lowervp != lowervp)) { if (dvp == NULLVP) return (EINVAL); } /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced elsewhere * if MALLOC should block. */ MALLOC(unp, struct unionfs_node *, sizeof(struct unionfs_node), M_UNIONFSNODE, M_WAITOK | M_ZERO); error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp); if (error != 0) { FREE(unp, M_UNIONFSNODE); return (error); } error = insmntque(vp, mp); /* XXX: Too early for mpsafe fs */ if (error != 0) { FREE(unp, M_UNIONFSNODE); return (error); } if (dvp != NULLVP) vref(dvp); if (uppervp != NULLVP) vref(uppervp); if (lowervp != NULLVP) vref(lowervp); unp->un_vnode = vp; unp->un_uppervp = uppervp; unp->un_lowervp = lowervp; unp->un_dvp = dvp; if (uppervp != NULLVP) vp->v_vnlock = uppervp->v_vnlock; else vp->v_vnlock = lowervp->v_vnlock; if (path != NULL) { unp->un_path = (char *) malloc(cnp->cn_namelen +1, M_UNIONFSPATH, M_WAITOK|M_ZERO); bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen); unp->un_path[cnp->cn_namelen] = '\0'; } vp->v_type = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type); vp->v_data = unp; if ((uppervp != NULLVP && ump->um_uppervp == uppervp) && (lowervp != NULLVP && ump->um_lowervp == lowervp)) vp->v_vflag |= VV_ROOT; if (lkflags & LK_TYPE_MASK) - vn_lock(vp, lkflags | LK_RETRY, td); + vn_lock(vp, lkflags | LK_RETRY); *vpp = vp; return (0); } /* * Clean up the unionfs node. */ void unionfs_noderem(struct vnode *vp, struct thread *td) { int vfslocked; struct unionfs_node *unp; struct unionfs_node_status *unsp, *unsp_tmp; struct vnode *lvp; struct vnode *uvp; /* * Use the interlock to protect the clearing of v_data to * prevent faults in unionfs_lock(). */ VI_LOCK(vp); unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; unp->un_lowervp = unp->un_uppervp = NULLVP; vp->v_vnlock = &(vp->v_lock); vp->v_data = NULL; lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(vp), td); if (lvp != NULLVP) VOP_UNLOCK(lvp, 0, td); if (uvp != NULLVP) VOP_UNLOCK(uvp, 0, td); vp->v_object = NULL; if (lvp != NULLVP) { vfslocked = VFS_LOCK_GIANT(lvp->v_mount); vrele(lvp); VFS_UNLOCK_GIANT(vfslocked); } if (uvp != NULLVP) { vfslocked = VFS_LOCK_GIANT(uvp->v_mount); vrele(uvp); VFS_UNLOCK_GIANT(vfslocked); } if (unp->un_dvp != NULLVP) { vfslocked = VFS_LOCK_GIANT(unp->un_dvp->v_mount); vrele(unp->un_dvp); VFS_UNLOCK_GIANT(vfslocked); unp->un_dvp = NULLVP; } if (unp->un_path) { free(unp->un_path, M_UNIONFSPATH); unp->un_path = NULL; } LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) { LIST_REMOVE(unsp, uns_list); free(unsp, M_TEMP); } FREE(unp, M_UNIONFSNODE); } /* * Get the unionfs node status. * You need exclusive lock this vnode. */ void unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, struct unionfs_node_status **unspp) { struct unionfs_node_status *unsp; KASSERT(NULL != unspp, ("null pointer")); ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), "unionfs_get_node_status"); LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) { if (unsp->uns_tid == td->td_tid) { *unspp = unsp; return; } } /* create a new unionfs node status */ MALLOC(unsp, struct unionfs_node_status *, sizeof(struct unionfs_node_status), M_TEMP, M_WAITOK | M_ZERO); unsp->uns_tid = td->td_tid; LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list); *unspp = unsp; } /* * Remove the unionfs node status, if you can. * You need exclusive lock this vnode. */ void unionfs_tryrem_node_status(struct unionfs_node *unp, struct thread *td, struct unionfs_node_status *unsp) { KASSERT(NULL != unsp, ("null pointer")); ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), "unionfs_get_node_status"); if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt) return; LIST_REMOVE(unsp, uns_list); free(unsp, M_TEMP); } /* * Create upper node attr. */ void unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva, struct vattr *uva, struct thread *td) { VATTR_NULL(uva); uva->va_type = lva->va_type; uva->va_atime = lva->va_atime; uva->va_mtime = lva->va_mtime; uva->va_ctime = lva->va_ctime; switch (ump->um_copymode) { case UNIONFS_TRANSPARENT: uva->va_mode = lva->va_mode; uva->va_uid = lva->va_uid; uva->va_gid = lva->va_gid; break; case UNIONFS_MASQUERADE: if (ump->um_uid == lva->va_uid) { uva->va_mode = lva->va_mode & 077077; uva->va_mode |= (lva->va_type == VDIR ? ump->um_udir : ump->um_ufile) & 0700; uva->va_uid = lva->va_uid; uva->va_gid = lva->va_gid; } else { uva->va_mode = (lva->va_type == VDIR ? ump->um_udir : ump->um_ufile); uva->va_uid = ump->um_uid; uva->va_gid = ump->um_gid; } break; default: /* UNIONFS_TRADITIONAL */ FILEDESC_SLOCK(td->td_proc->p_fd); uva->va_mode = 0777 & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); uva->va_uid = ump->um_uid; uva->va_gid = ump->um_gid; break; } } /* * Create upper node attr. */ int unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp, struct vattr *uva, struct ucred *cred, struct thread *td) { int error; struct vattr lva; if ((error = VOP_GETATTR(lvp, &lva, cred, td))) return (error); unionfs_create_uppervattr_core(ump, &lva, uva, td); return (error); } /* * relookup * * dvp should be locked on entry and will be locked on return. * * If an error is returned, *vpp will be invalid, otherwise it will hold a * locked, referenced vnode. If *vpp == dvp then remember that only one * LK_EXCLUSIVE lock is held. */ static int unionfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct componentname *cn, struct thread *td, char *path, int pathlen, u_long nameiop) { int error; cn->cn_namelen = pathlen; cn->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); bcopy(path, cn->cn_pnbuf, pathlen); cn->cn_pnbuf[pathlen] = '\0'; cn->cn_nameiop = nameiop; cn->cn_flags = (LOCKPARENT | LOCKLEAF | HASBUF | SAVENAME | ISLASTCN); cn->cn_lkflags = LK_EXCLUSIVE; cn->cn_thread = td; cn->cn_cred = cnp->cn_cred; cn->cn_nameptr = cn->cn_pnbuf; cn->cn_consume = cnp->cn_consume; if (nameiop == DELETE) cn->cn_flags |= (cnp->cn_flags & (DOWHITEOUT | SAVESTART)); else if (RENAME == nameiop) cn->cn_flags |= (cnp->cn_flags & SAVESTART); vref(dvp); VOP_UNLOCK(dvp, 0, td); if ((error = relookup(dvp, vpp, cn))) { uma_zfree(namei_zone, cn->cn_pnbuf); cn->cn_flags &= ~HASBUF; - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } else vrele(dvp); return (error); } /* * relookup for CREATE namei operation. * * dvp is unionfs vnode. dvp should be locked. * * If it called 'unionfs_copyfile' function by unionfs_link etc, * VOP_LOOKUP information is broken. * So it need relookup in order to create link etc. */ int unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp, struct thread *td) { int error; struct vnode *udvp; struct vnode *vp; struct componentname cn; udvp = UNIONFSVPTOUPPERVP(dvp); vp = NULLVP; error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, strlen(cnp->cn_nameptr), CREATE); if (error) return (error); if (vp != NULLVP) { if (udvp == vp) vrele(vp); else vput(vp); error = EEXIST; } if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (!error) { cn.cn_flags |= (cnp->cn_flags & HASBUF); cnp->cn_flags = cn.cn_flags; } return (error); } /* * relookup for DELETE namei operation. * * dvp is unionfs vnode. dvp should be locked. */ int unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp, struct thread *td) { int error; struct vnode *udvp; struct vnode *vp; struct componentname cn; udvp = UNIONFSVPTOUPPERVP(dvp); vp = NULLVP; error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, strlen(cnp->cn_nameptr), DELETE); if (error) return (error); if (vp == NULLVP) error = ENOENT; else { if (udvp == vp) vrele(vp); else vput(vp); } if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (!error) { cn.cn_flags |= (cnp->cn_flags & HASBUF); cnp->cn_flags = cn.cn_flags; } return (error); } /* * relookup for RENAME namei operation. * * dvp is unionfs vnode. dvp should be locked. */ int unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp, struct thread *td) { int error; struct vnode *udvp; struct vnode *vp; struct componentname cn; udvp = UNIONFSVPTOUPPERVP(dvp); vp = NULLVP; error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, strlen(cnp->cn_nameptr), RENAME); if (error) return (error); if (vp != NULLVP) { if (udvp == vp) vrele(vp); else vput(vp); } if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (!error) { cn.cn_flags |= (cnp->cn_flags & HASBUF); cnp->cn_flags = cn.cn_flags; } return (error); } /* * Update the unionfs_node. * * uvp is new locked upper vnode. unionfs vnode's lock will be exchanged to the * uvp's lock and lower's lock will be unlocked. */ static void unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp, struct thread *td) { int count, lockcnt; struct vnode *vp; struct vnode *lvp; vp = UNIONFSTOV(unp); lvp = unp->un_lowervp; /* * lock update */ VI_LOCK(vp); unp->un_uppervp = uvp; vp->v_vnlock = uvp->v_vnlock; lockcnt = lvp->v_vnlock->lk_exclusivecount; if (lockcnt <= 0) panic("unionfs: no exclusive lock"); VI_UNLOCK(vp); for (count = 1; count < lockcnt; count++) - vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY, td); + vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY); } /* * Create a new shadow dir. * * udvp should be locked on entry and will be locked on return. * * If no error returned, unp will be updated. */ int unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, struct unionfs_node *unp, struct componentname *cnp, struct thread *td) { int error; struct vnode *lvp; struct vnode *uvp; struct vattr va; struct vattr lva; struct componentname cn; struct mount *mp; struct ucred *cred; struct ucred *credbk; struct uidinfo *rootinfo; if (unp->un_uppervp != NULLVP) return (EEXIST); lvp = unp->un_lowervp; uvp = NULLVP; credbk = cnp->cn_cred; /* Authority change to root */ rootinfo = uifind((uid_t)0); cred = crdup(cnp->cn_cred); chgproccnt(cred->cr_ruidinfo, 1, 0); change_euid(cred, rootinfo); change_ruid(cred, rootinfo); change_svuid(cred, (uid_t)0); uifree(rootinfo); cnp->cn_cred = cred; memset(&cn, 0, sizeof(cn)); if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred, td))) goto unionfs_mkshadowdir_abort; if ((error = unionfs_relookup(udvp, &uvp, cnp, &cn, td, cnp->cn_nameptr, cnp->cn_namelen, CREATE))) goto unionfs_mkshadowdir_abort; if (uvp != NULLVP) { if (udvp == uvp) vrele(uvp); else vput(uvp); error = EEXIST; goto unionfs_mkshadowdir_free_out; } if ((error = vn_start_write(udvp, &mp, V_WAIT | PCATCH))) goto unionfs_mkshadowdir_free_out; if ((error = VOP_LEASE(udvp, td, cn.cn_cred, LEASE_WRITE))) { vn_finished_write(mp); goto unionfs_mkshadowdir_free_out; } unionfs_create_uppervattr_core(ump, &lva, &va, td); error = VOP_MKDIR(udvp, &uvp, &cn, &va); if (!error) { unionfs_node_update(unp, uvp, td); /* * XXX The bug which cannot set uid/gid was corrected. * Ignore errors. */ va.va_type = VNON; VOP_SETATTR(uvp, &va, cn.cn_cred, td); } vn_finished_write(mp); unionfs_mkshadowdir_free_out: if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } unionfs_mkshadowdir_abort: cnp->cn_cred = credbk; chgproccnt(cred->cr_ruidinfo, -1, 0); crfree(cred); return (error); } /* * Create a new whiteout. * * dvp should be locked on entry and will be locked on return. */ int unionfs_mkwhiteout(struct vnode *dvp, struct componentname *cnp, struct thread *td, char *path) { int error; struct vnode *wvp; struct componentname cn; struct mount *mp; if (path == NULL) path = cnp->cn_nameptr; wvp = NULLVP; if ((error = unionfs_relookup(dvp, &wvp, cnp, &cn, td, path, strlen(path), CREATE))) return (error); if (wvp != NULLVP) { if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (dvp == wvp) vrele(wvp); else vput(wvp); return (EEXIST); } if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH))) goto unionfs_mkwhiteout_free_out; if (!(error = VOP_LEASE(dvp, td, td->td_ucred, LEASE_WRITE))) error = VOP_WHITEOUT(dvp, &cn, CREATE); vn_finished_write(mp); unionfs_mkwhiteout_free_out: if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } return (error); } /* * Create a new vnode for create a new shadow file. * * If an error is returned, *vpp will be invalid, otherwise it will hold a * locked, referenced and opened vnode. * * unp is never updated. */ static int unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp, struct unionfs_node *unp, struct vattr *uvap, struct thread *td) { struct unionfs_mount *ump; struct vnode *vp; struct vnode *lvp; struct ucred *cred; struct vattr lva; int fmode; int error; struct componentname cn; ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount); vp = NULLVP; lvp = unp->un_lowervp; cred = td->td_ucred; fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL); error = 0; if ((error = VOP_GETATTR(lvp, &lva, cred, td)) != 0) return (error); unionfs_create_uppervattr_core(ump, &lva, uvap, td); if (unp->un_path == NULL) panic("unionfs: un_path is null"); cn.cn_namelen = strlen(unp->un_path); cn.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); bcopy(unp->un_path, cn.cn_pnbuf, cn.cn_namelen + 1); cn.cn_nameiop = CREATE; cn.cn_flags = (LOCKPARENT | LOCKLEAF | HASBUF | SAVENAME | ISLASTCN); cn.cn_lkflags = LK_EXCLUSIVE; cn.cn_thread = td; cn.cn_cred = cred; cn.cn_nameptr = cn.cn_pnbuf; cn.cn_consume = 0; vref(udvp); if ((error = relookup(udvp, &vp, &cn)) != 0) goto unionfs_vn_create_on_upper_free_out2; vrele(udvp); if (vp != NULLVP) { if (vp == udvp) vrele(vp); else vput(vp); error = EEXIST; goto unionfs_vn_create_on_upper_free_out1; } if ((error = VOP_LEASE(udvp, td, cred, LEASE_WRITE)) != 0) goto unionfs_vn_create_on_upper_free_out1; if ((error = VOP_CREATE(udvp, &vp, &cn, uvap)) != 0) goto unionfs_vn_create_on_upper_free_out1; if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) { vput(vp); goto unionfs_vn_create_on_upper_free_out1; } vp->v_writecount++; *vpp = vp; unionfs_vn_create_on_upper_free_out1: VOP_UNLOCK(udvp, 0, td); unionfs_vn_create_on_upper_free_out2: if (cn.cn_flags & HASBUF) { uma_zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } return (error); } /* * Copy from lvp to uvp. * * lvp and uvp should be locked and opened on entry and will be locked and * opened on return. */ static int unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp, struct ucred *cred, struct thread *td) { int error; off_t offset; int count; int bufoffset; char *buf; struct uio uio; struct iovec iov; error = 0; memset(&uio, 0, sizeof(uio)); uio.uio_td = td; uio.uio_segflg = UIO_SYSSPACE; uio.uio_offset = 0; if ((error = VOP_LEASE(lvp, td, cred, LEASE_READ)) != 0) return (error); if ((error = VOP_LEASE(uvp, td, cred, LEASE_WRITE)) != 0) return (error); buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); while (error == 0) { offset = uio.uio_offset; uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf; iov.iov_len = MAXBSIZE; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_READ; if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0) break; if ((count = MAXBSIZE - uio.uio_resid) == 0) break; bufoffset = 0; while (bufoffset < count) { uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf + bufoffset; iov.iov_len = count - bufoffset; uio.uio_offset = offset + bufoffset; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_WRITE; if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0) break; bufoffset += (count - bufoffset) - uio.uio_resid; } uio.uio_offset = offset + bufoffset; } free(buf, M_TEMP); return (error); } /* * Copy file from lower to upper. * * If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to * docopy. * * If no error returned, unp will be updated. */ int unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred, struct thread *td) { int error; struct mount *mp; struct vnode *udvp; struct vnode *lvp; struct vnode *uvp; struct vattr uva; lvp = unp->un_lowervp; uvp = NULLVP; if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (unp->un_dvp == NULLVP) return (EINVAL); if (unp->un_uppervp != NULLVP) return (EEXIST); udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp; if (udvp == NULLVP) return (EROFS); if ((udvp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); error = VOP_ACCESS(lvp, VREAD, cred, td); if (error != 0) return (error); if ((error = vn_start_write(udvp, &mp, V_WAIT | PCATCH)) != 0) return (error); error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td); if (error != 0) { vn_finished_write(mp); return (error); } if (docopy != 0) { error = VOP_OPEN(lvp, FREAD, cred, td, NULL); if (error == 0) { error = unionfs_copyfile_core(lvp, uvp, cred, td); VOP_CLOSE(lvp, FREAD, cred, td); } } VOP_CLOSE(uvp, FWRITE, cred, td); uvp->v_writecount--; vn_finished_write(mp); if (error == 0) { /* Reset the attributes. Ignore errors. */ uva.va_type = VNON; VOP_SETATTR(uvp, &uva, cred, td); } unionfs_node_update(unp, uvp, td); return (error); } /* * It checks whether vp can rmdir. (check empty) * * vp is unionfs vnode. * vp should be locked. */ int unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td) { int error; int eofflag; int lookuperr; struct vnode *uvp; struct vnode *lvp; struct vnode *tvp; struct vattr va; struct componentname cn; /* * The size of buf needs to be larger than DIRBLKSIZ. */ char buf[256 * 6]; struct dirent *dp; struct dirent *edp; struct uio uio; struct iovec iov; ASSERT_VOP_ELOCKED(vp, "unionfs_check_rmdir"); eofflag = 0; uvp = UNIONFSVPTOUPPERVP(vp); lvp = UNIONFSVPTOLOWERVP(vp); /* check opaque */ if ((error = VOP_GETATTR(uvp, &va, cred, td)) != 0) return (error); if (va.va_flags & OPAQUE) return (0); /* open vnode */ #ifdef MAC if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0) return (error); #endif if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0) return (error); if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0) return (error); uio.uio_rw = UIO_READ; uio.uio_segflg = UIO_SYSSPACE; uio.uio_td = td; uio.uio_offset = 0; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, lvp); #endif while (!error && !eofflag) { iov.iov_base = buf; iov.iov_len = sizeof(buf); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = iov.iov_len; error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL); if (error) break; edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid]; for (dp = (struct dirent*)buf; !error && dp < edp; dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) { if (dp->d_type == DT_WHT || (dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2))) continue; cn.cn_namelen = dp->d_namlen; cn.cn_pnbuf = NULL; cn.cn_nameptr = dp->d_name; cn.cn_nameiop = LOOKUP; cn.cn_flags = (LOCKPARENT | LOCKLEAF | SAVENAME | RDONLY | ISLASTCN); cn.cn_lkflags = LK_EXCLUSIVE; cn.cn_thread = td; cn.cn_cred = cred; cn.cn_consume = 0; /* * check entry in lower. * Sometimes, readdir function returns * wrong entry. */ lookuperr = VOP_LOOKUP(lvp, &tvp, &cn); if (!lookuperr) vput(tvp); else continue; /* skip entry */ /* * check entry * If it has no exist/whiteout entry in upper, * directory is not empty. */ cn.cn_flags = (LOCKPARENT | LOCKLEAF | SAVENAME | RDONLY | ISLASTCN); lookuperr = VOP_LOOKUP(uvp, &tvp, &cn); if (!lookuperr) vput(tvp); /* ignore exist or whiteout entry */ if (!lookuperr || (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT))) continue; error = ENOTEMPTY; } } /* close vnode */ VOP_CLOSE(vp, FREAD, cred, td); return (error); } #ifdef DIAGNOSTIC struct vnode * unionfs_checkuppervp(struct vnode *vp, char *fil, int lno) { struct unionfs_node *unp; unp = VTOUNIONFS(vp); #ifdef notyet if (vp->v_op != unionfs_vnodeop_p) { printf("unionfs_checkuppervp: on non-unionfs-node.\n"); #ifdef KDB kdb_enter(KDB_WHY_UNIONFS, "unionfs_checkuppervp: on non-unionfs-node.\n"); #endif panic("unionfs_checkuppervp"); }; #endif return (unp->un_uppervp); } struct vnode * unionfs_checklowervp(struct vnode *vp, char *fil, int lno) { struct unionfs_node *unp; unp = VTOUNIONFS(vp); #ifdef notyet if (vp->v_op != unionfs_vnodeop_p) { printf("unionfs_checklowervp: on non-unionfs-node.\n"); #ifdef KDB kdb_enter(KDB_WHY_UNIONFS, "unionfs_checklowervp: on non-unionfs-node.\n"); #endif panic("unionfs_checklowervp"); }; #endif return (unp->un_lowervp); } #endif Index: head/sys/fs/unionfs/union_vfsops.c =================================================================== --- head/sys/fs/unionfs/union_vfsops.c (revision 175201) +++ head/sys/fs/unionfs/union_vfsops.c (revision 175202) @@ -1,562 +1,562 @@ /*- * Copyright (c) 1994, 1995 The Regents of the University of California. * Copyright (c) 1994, 1995 Jan-Simon Pendry. * Copyright (c) 2005, 2006 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006 Daichi Goto * All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_UNIONFSMNT, "UNIONFS mount", "UNIONFS mount structure"); static vfs_fhtovp_t unionfs_fhtovp; static vfs_checkexp_t unionfs_checkexp; static vfs_mount_t unionfs_domount; static vfs_quotactl_t unionfs_quotactl; static vfs_root_t unionfs_root; static vfs_sync_t unionfs_sync; static vfs_statfs_t unionfs_statfs; static vfs_unmount_t unionfs_unmount; static vfs_vget_t unionfs_vget; static vfs_extattrctl_t unionfs_extattrctl; static struct vfsops unionfs_vfsops; /* * Exchange from userland file mode to vmode. */ static u_short mode2vmode(mode_t mode) { u_short ret; ret = 0; /* other */ if (mode & S_IXOTH) ret |= VEXEC >> 6; if (mode & S_IWOTH) ret |= VWRITE >> 6; if (mode & S_IROTH) ret |= VREAD >> 6; /* group */ if (mode & S_IXGRP) ret |= VEXEC >> 3; if (mode & S_IWGRP) ret |= VWRITE >> 3; if (mode & S_IRGRP) ret |= VREAD >> 3; /* owner */ if (mode & S_IXUSR) ret |= VEXEC; if (mode & S_IWUSR) ret |= VWRITE; if (mode & S_IRUSR) ret |= VREAD; return (ret); } /* * Mount unionfs layer. */ static int unionfs_domount(struct mount *mp, struct thread *td) { int error; struct vnode *lowerrootvp; struct vnode *upperrootvp; struct unionfs_mount *ump; char *target; char *tmp; char *ep; int len; size_t done; int below; uid_t uid; gid_t gid; u_short udir; u_short ufile; unionfs_copymode copymode; unionfs_whitemode whitemode; struct componentname fakecn; struct nameidata nd, *ndp; struct vattr va; UNIONFSDEBUG("unionfs_mount(mp = %p)\n", (void *)mp); error = 0; below = 0; uid = 0; gid = 0; udir = 0; ufile = 0; copymode = UNIONFS_TRANSPARENT; /* default */ whitemode = UNIONFS_WHITE_ALWAYS; ndp = &nd; if (mp->mnt_flag & MNT_ROOTFS) { vfs_mount_error(mp, "Cannot union mount root filesystem"); return (EOPNOTSUPP); } /* * Update is a no operation. */ if (mp->mnt_flag & MNT_UPDATE) { vfs_mount_error(mp, "unionfs does not support mount update"); return (EOPNOTSUPP); } /* * Get argument */ error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len); if (error) error = vfs_getopt(mp->mnt_optnew, "from", (void **)&target, &len); if (error || target[len - 1] != '\0') { vfs_mount_error(mp, "Invalid target"); return (EINVAL); } if (vfs_getopt(mp->mnt_optnew, "below", NULL, NULL) == 0) below = 1; if (vfs_getopt(mp->mnt_optnew, "udir", (void **)&tmp, NULL) == 0) { if (tmp != NULL) udir = (mode_t)strtol(tmp, &ep, 8); if (tmp == NULL || *ep) { vfs_mount_error(mp, "Invalid udir"); return (EINVAL); } udir = mode2vmode(udir); } if (vfs_getopt(mp->mnt_optnew, "ufile", (void **)&tmp, NULL) == 0) { if (tmp != NULL) ufile = (mode_t)strtol(tmp, &ep, 8); if (tmp == NULL || *ep) { vfs_mount_error(mp, "Invalid ufile"); return (EINVAL); } ufile = mode2vmode(ufile); } /* check umask, uid and gid */ if (udir == 0 && ufile != 0) udir = ufile; if (ufile == 0 && udir != 0) ufile = udir; - vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY, td); + vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY); error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred, td); if (!error) { if (udir == 0) udir = va.va_mode; if (ufile == 0) ufile = va.va_mode; uid = va.va_uid; gid = va.va_gid; } VOP_UNLOCK(mp->mnt_vnodecovered, 0, td); if (error) return (error); if (mp->mnt_cred->cr_ruid == 0) { /* root only */ if (vfs_getopt(mp->mnt_optnew, "uid", (void **)&tmp, NULL) == 0) { if (tmp != NULL) uid = (uid_t)strtol(tmp, &ep, 10); if (tmp == NULL || *ep) { vfs_mount_error(mp, "Invalid uid"); return (EINVAL); } } if (vfs_getopt(mp->mnt_optnew, "gid", (void **)&tmp, NULL) == 0) { if (tmp != NULL) gid = (gid_t)strtol(tmp, &ep, 10); if (tmp == NULL || *ep) { vfs_mount_error(mp, "Invalid gid"); return (EINVAL); } } if (vfs_getopt(mp->mnt_optnew, "copymode", (void **)&tmp, NULL) == 0) { if (tmp == NULL) { vfs_mount_error(mp, "Invalid copymode"); return (EINVAL); } else if (strcasecmp(tmp, "traditional") == 0) copymode = UNIONFS_TRADITIONAL; else if (strcasecmp(tmp, "transparent") == 0) copymode = UNIONFS_TRANSPARENT; else if (strcasecmp(tmp, "masquerade") == 0) copymode = UNIONFS_MASQUERADE; else { vfs_mount_error(mp, "Invalid copymode"); return (EINVAL); } } if (vfs_getopt(mp->mnt_optnew, "whiteout", (void **)&tmp, NULL) == 0) { if (tmp == NULL) { vfs_mount_error(mp, "Invalid whiteout mode"); return (EINVAL); } else if (strcasecmp(tmp, "always") == 0) whitemode = UNIONFS_WHITE_ALWAYS; else if (strcasecmp(tmp, "whenneeded") == 0) whitemode = UNIONFS_WHITE_WHENNEEDED; else { vfs_mount_error(mp, "Invalid whiteout mode"); return (EINVAL); } } } /* If copymode is UNIONFS_TRADITIONAL, uid/gid is mounted user. */ if (copymode == UNIONFS_TRADITIONAL) { uid = mp->mnt_cred->cr_ruid; gid = mp->mnt_cred->cr_rgid; } UNIONFSDEBUG("unionfs_mount: uid=%d, gid=%d\n", uid, gid); UNIONFSDEBUG("unionfs_mount: udir=0%03o, ufile=0%03o\n", udir, ufile); UNIONFSDEBUG("unionfs_mount: copymode=%d\n", copymode); /* * Find upper node */ NDINIT(ndp, LOOKUP, FOLLOW | WANTPARENT | LOCKLEAF, UIO_SYSSPACE, target, td); if ((error = namei(ndp))) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); /* get root vnodes */ lowerrootvp = mp->mnt_vnodecovered; upperrootvp = ndp->ni_vp; vrele(ndp->ni_dvp); ndp->ni_dvp = NULLVP; /* create unionfs_mount */ ump = (struct unionfs_mount *)malloc(sizeof(struct unionfs_mount), M_UNIONFSMNT, M_WAITOK | M_ZERO); /* * Save reference */ if (below) { VOP_UNLOCK(upperrootvp, 0, td); - vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY); ump->um_lowervp = upperrootvp; ump->um_uppervp = lowerrootvp; } else { ump->um_lowervp = lowerrootvp; ump->um_uppervp = upperrootvp; } ump->um_rootvp = NULLVP; ump->um_uid = uid; ump->um_gid = gid; ump->um_udir = udir; ump->um_ufile = ufile; ump->um_copymode = copymode; ump->um_whitemode = whitemode; MNT_ILOCK(mp); if ((lowerrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE) && (upperrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE)) mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); mp->mnt_data = ump; /* * Copy upper layer's RDONLY flag. */ mp->mnt_flag |= ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY; /* * Check whiteout */ if ((mp->mnt_flag & MNT_RDONLY) == 0) { memset(&fakecn, 0, sizeof(fakecn)); fakecn.cn_nameiop = LOOKUP; fakecn.cn_thread = td; error = VOP_WHITEOUT(ump->um_uppervp, &fakecn, LOOKUP); if (error) { if (below) { VOP_UNLOCK(ump->um_uppervp, 0, td); vrele(upperrootvp); } else vput(ump->um_uppervp); free(ump, M_UNIONFSMNT); mp->mnt_data = NULL; return (error); } } /* * Unlock the node */ VOP_UNLOCK(ump->um_uppervp, 0, td); /* * Get the unionfs root vnode. */ error = unionfs_nodeget(mp, ump->um_uppervp, ump->um_lowervp, NULLVP, &(ump->um_rootvp), NULL, td); vrele(upperrootvp); if (error) { free(ump, M_UNIONFSMNT); mp->mnt_data = NULL; return (error); } /* * Check mnt_flag */ if ((ump->um_lowervp->v_mount->mnt_flag & MNT_LOCAL) && (ump->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) mp->mnt_flag |= MNT_LOCAL; /* * Get new fsid */ vfs_getnewfsid(mp); len = MNAMELEN - 1; tmp = mp->mnt_stat.f_mntfromname; copystr((below ? ":" : ":"), tmp, len, &done); len -= done - 1; tmp += done - 1; copystr(target, tmp, len, NULL); UNIONFSDEBUG("unionfs_mount: from %s, on %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); return (0); } /* * Free reference to unionfs layer */ static int unionfs_unmount(struct mount *mp, int mntflags, struct thread *td) { struct unionfs_mount *ump; int error; int num; int freeing; int flags; UNIONFSDEBUG("unionfs_unmount: mp = %p\n", (void *)mp); ump = MOUNTTOUNIONFSMOUNT(mp); flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* vflush (no need to call vrele) */ for (freeing = 0; (error = vflush(mp, 1, flags, td)) != 0;) { num = mp->mnt_nvnodelistsize; if (num == freeing) break; freeing = num; } if (error) return (error); free(ump, M_UNIONFSMNT); mp->mnt_data = 0; return (0); } static int unionfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct unionfs_mount *ump; struct vnode *vp; ump = MOUNTTOUNIONFSMOUNT(mp); vp = ump->um_rootvp; UNIONFSDEBUG("unionfs_root: rootvp=%p locked=%x\n", vp, VOP_ISLOCKED(vp, td)); vref(vp); if (flags & LK_TYPE_MASK) - vn_lock(vp, flags, td); + vn_lock(vp, flags); *vpp = vp; return (0); } static int unionfs_quotactl(struct mount *mp, int cmd, uid_t uid, void *arg, struct thread *td) { struct unionfs_mount *ump; ump = MOUNTTOUNIONFSMOUNT(mp); /* * Writing is always performed to upper vnode. */ return (VFS_QUOTACTL(ump->um_uppervp->v_mount, cmd, uid, arg, td)); } static int unionfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { struct unionfs_mount *ump; int error; struct statfs mstat; uint64_t lbsize; ump = MOUNTTOUNIONFSMOUNT(mp); UNIONFSDEBUG("unionfs_statfs(mp = %p, lvp = %p, uvp = %p)\n", (void *)mp, (void *)ump->um_lowervp, (void *)ump->um_uppervp); bzero(&mstat, sizeof(mstat)); error = VFS_STATFS(ump->um_lowervp->v_mount, &mstat, td); if (error) return (error); /* now copy across the "interesting" information and fake the rest */ sbp->f_blocks = mstat.f_blocks; sbp->f_files = mstat.f_files; lbsize = mstat.f_bsize; error = VFS_STATFS(ump->um_uppervp->v_mount, &mstat, td); if (error) return (error); /* * The FS type etc is copy from upper vfs. * (write able vfs have priority) */ sbp->f_type = mstat.f_type; sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; if (mstat.f_bsize != lbsize) sbp->f_blocks = ((off_t)sbp->f_blocks * lbsize) / mstat.f_bsize; sbp->f_blocks += mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files += mstat.f_files; sbp->f_ffree = mstat.f_ffree; return (0); } static int unionfs_sync(struct mount *mp, int waitfor, struct thread *td) { /* nothing to do */ return (0); } static int unionfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { return (EOPNOTSUPP); } static int unionfs_fhtovp(struct mount *mp, struct fid *fidp, struct vnode **vpp) { return (EOPNOTSUPP); } static int unionfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp) { return (EOPNOTSUPP); } static int unionfs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int namespace, const char *attrname, struct thread *td) { struct unionfs_mount *ump; struct unionfs_node *unp; ump = MOUNTTOUNIONFSMOUNT(mp); unp = VTOUNIONFS(filename_vp); if (unp->un_uppervp != NULLVP) { return (VFS_EXTATTRCTL(ump->um_uppervp->v_mount, cmd, unp->un_uppervp, namespace, attrname, td)); } else { return (VFS_EXTATTRCTL(ump->um_lowervp->v_mount, cmd, unp->un_lowervp, namespace, attrname, td)); } } static struct vfsops unionfs_vfsops = { .vfs_checkexp = unionfs_checkexp, .vfs_extattrctl = unionfs_extattrctl, .vfs_fhtovp = unionfs_fhtovp, .vfs_init = unionfs_init, .vfs_mount = unionfs_domount, .vfs_quotactl = unionfs_quotactl, .vfs_root = unionfs_root, .vfs_statfs = unionfs_statfs, .vfs_sync = unionfs_sync, .vfs_uninit = unionfs_uninit, .vfs_unmount = unionfs_unmount, .vfs_vget = unionfs_vget, }; VFS_SET(unionfs_vfsops, unionfs, VFCF_LOOPBACK); Index: head/sys/fs/unionfs/union_vnops.c =================================================================== --- head/sys/fs/unionfs/union_vnops.c (revision 175201) +++ head/sys/fs/unionfs/union_vnops.c (revision 175202) @@ -1,2330 +1,2331 @@ /*- * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. * Copyright (c) 1992, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2005, 2006 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006 Daichi Goto * All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_vnops.c 8.32 (Berkeley) 6/23/95 * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if 0 #define UNIONFS_INTERNAL_DEBUG(msg, args...) printf(msg, ## args) #define UNIONFS_IDBG_RENAME #else #define UNIONFS_INTERNAL_DEBUG(msg, args...) #endif /* lockmgr lock <-> reverse table */ struct lk_lr_table { int lock; int revlock; }; static struct lk_lr_table un_llt[] = { {LK_SHARED, LK_RELEASE}, {LK_EXCLUSIVE, LK_RELEASE}, {LK_UPGRADE, LK_DOWNGRADE}, {LK_DOWNGRADE, LK_UPGRADE}, {0, 0} }; static int unionfs_lookup(struct vop_cachedlookup_args *ap) { int iswhiteout; int lockflag; int error , uerror, lerror; u_long nameiop; u_long cnflags, cnflagsbk; struct unionfs_node *dunp; struct vnode *dvp, *udvp, *ldvp, *vp, *uvp, *lvp, *dtmpvp; struct vattr va; struct componentname *cnp; struct thread *td; iswhiteout = 0; lockflag = 0; error = uerror = lerror = ENOENT; cnp = ap->a_cnp; nameiop = cnp->cn_nameiop; cnflags = cnp->cn_flags; dvp = ap->a_dvp; dunp = VTOUNIONFS(dvp); udvp = dunp->un_uppervp; ldvp = dunp->un_lowervp; vp = uvp = lvp = NULLVP; td = curthread; *(ap->a_vpp) = NULLVP; UNIONFS_INTERNAL_DEBUG("unionfs_lookup: enter: nameiop=%ld, flags=%lx, path=%s\n", nameiop, cnflags, cnp->cn_nameptr); if (dvp->v_type != VDIR) return (ENOTDIR); /* * If read-only and op is not LOOKUP, will return EROFS. */ if ((cnflags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && LOOKUP != nameiop) return (EROFS); /* * lookup dotdot */ if (cnflags & ISDOTDOT) { if (LOOKUP != nameiop && udvp == NULLVP) return (EROFS); if (udvp != NULLVP) { dtmpvp = udvp; if (ldvp != NULLVP) VOP_UNLOCK(ldvp, 0, td); } else dtmpvp = ldvp; error = VOP_LOOKUP(dtmpvp, &vp, cnp); if (dtmpvp == udvp && ldvp != NULLVP) { VOP_UNLOCK(udvp, 0, td); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } if (error == 0) { /* * Exchange lock and reference from vp to * dunp->un_dvp. vp is upper/lower vnode, but it * will need to return the unionfs vnode. */ if (nameiop == DELETE || nameiop == RENAME || (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp, 0, td); vrele(vp); VOP_UNLOCK(dvp, 0, td); *(ap->a_vpp) = dunp->un_dvp; vref(dunp->un_dvp); if (nameiop == DELETE || nameiop == RENAME) - vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY); else if (cnp->cn_lkflags & LK_TYPE_MASK) - vn_lock(dunp->un_dvp, cnp->cn_lkflags | LK_RETRY, td); + vn_lock(dunp->un_dvp, cnp->cn_lkflags | + LK_RETRY); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } else if (error == ENOENT && (cnflags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, NULLVP, cnp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); return (error); } /* * lookup upper layer */ if (udvp != NULLVP) { uerror = VOP_LOOKUP(udvp, &uvp, cnp); if (uerror == 0) { if (udvp == uvp) { /* is dot */ vrele(uvp); *(ap->a_vpp) = dvp; vref(dvp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", uerror); return (uerror); } if (nameiop == DELETE || nameiop == RENAME || (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(uvp, 0, td); } /* check whiteout */ if (uerror == ENOENT || uerror == EJUSTRETURN) if (cnp->cn_flags & ISWHITEOUT) iswhiteout = 1; /* don't lookup lower */ if (iswhiteout == 0 && ldvp != NULLVP) if (VOP_GETATTR(udvp, &va, cnp->cn_cred, td) == 0 && (va.va_flags & OPAQUE)) iswhiteout = 1; /* don't lookup lower */ #if 0 UNIONFS_INTERNAL_DEBUG("unionfs_lookup: debug: whiteout=%d, path=%s\n", iswhiteout, cnp->cn_nameptr); #endif } /* * lookup lower layer */ if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) { /* always op is LOOKUP */ cnp->cn_nameiop = LOOKUP; cnflagsbk = cnp->cn_flags; cnp->cn_flags = cnflags; lerror = VOP_LOOKUP(ldvp, &lvp, cnp); cnp->cn_nameiop = nameiop; if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN)) cnp->cn_flags = cnflagsbk; if (lerror == 0) { if (ldvp == lvp) { /* is dot */ if (uvp != NULLVP) vrele(uvp); /* no need? */ vrele(lvp); *(ap->a_vpp) = dvp; vref(dvp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", lerror); return (lerror); } if (cnp->cn_lkflags & LK_TYPE_MASK) VOP_UNLOCK(lvp, 0, td); } } /* * check lookup result */ if (uvp == NULLVP && lvp == NULLVP) { UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", (udvp != NULLVP ? uerror : lerror)); return (udvp != NULLVP ? uerror : lerror); } /* * check vnode type */ if (uvp != NULLVP && lvp != NULLVP && uvp->v_type != lvp->v_type) { vrele(lvp); lvp = NULLVP; } /* * check shadow dir */ if (uerror != 0 && uerror != EJUSTRETURN && udvp != NULLVP && lerror == 0 && lvp != NULLVP && lvp->v_type == VDIR && !(dvp->v_mount->mnt_flag & MNT_RDONLY) && (1 < cnp->cn_namelen || '.' != *(cnp->cn_nameptr))) { /* get unionfs vnode in order to create a new shadow dir. */ error = unionfs_nodeget(dvp->v_mount, NULLVP, lvp, dvp, &vp, cnp, td); if (error != 0) goto unionfs_lookup_out; if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp, 0, td); if (LK_EXCLUSIVE != VOP_ISLOCKED(vp, td)) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); lockflag = 1; } error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount), udvp, VTOUNIONFS(vp), cnp, td); if (lockflag != 0) VOP_UNLOCK(vp, 0, td); if (error != 0) { UNIONFSDEBUG("unionfs_lookup: Unable to create shadow dir."); if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) vput(vp); else vrele(vp); goto unionfs_lookup_out; } if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED) - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); } /* * get unionfs vnode. */ else { if (uvp != NULLVP) error = uerror; else error = lerror; if (error != 0) goto unionfs_lookup_out; error = unionfs_nodeget(dvp->v_mount, uvp, lvp, dvp, &vp, cnp, td); if (error != 0) { UNIONFSDEBUG("unionfs_lookup: Unable to create unionfs vnode."); goto unionfs_lookup_out; } if ((nameiop == DELETE || nameiop == RENAME) && (cnp->cn_lkflags & LK_TYPE_MASK) == 0) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } *(ap->a_vpp) = vp; if (cnflags & MAKEENTRY) cache_enter(dvp, vp, cnp); unionfs_lookup_out: if (uvp != NULLVP) vrele(uvp); if (lvp != NULLVP) vrele(lvp); if (error == ENOENT && (cnflags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, NULLVP, cnp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); return (error); } static int unionfs_create(struct vop_create_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_create: enter\n"); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { if ((error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(vp, 0, td); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); vrele(vp); } } UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error); return (error); } static int unionfs_whiteout(struct vop_whiteout_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: enter\n"); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EOPNOTSUPP; if (udvp != NULLVP) { switch (ap->a_flags) { case CREATE: case DELETE: case LOOKUP: error = VOP_WHITEOUT(udvp, cnp, ap->a_flags); break; default: error = EINVAL; break; } } UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: leave (%d)\n", error); return (error); } static int unionfs_mknod(struct vop_mknod_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_mknod: enter\n"); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { if ((error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(vp, 0, td); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); vrele(vp); } } UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error); return (error); } static int unionfs_open(struct vop_open_args *ap) { int error; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *uvp; struct vnode *lvp; struct vnode *targetvp; struct ucred *cred; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n"); error = 0; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; targetvp = NULLVP; cred = ap->a_cred; td = ap->a_td; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0 || unsp->uns_upper_opencnt > 0) { /* vnode is already opend. */ if (unsp->uns_upper_opencnt > 0) targetvp = uvp; else targetvp = lvp; if (targetvp == lvp && (ap->a_mode & FWRITE) && lvp->v_type == VREG) targetvp = NULLVP; } if (targetvp == NULLVP) { if (uvp == NULLVP) { if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) { error = unionfs_copyfile(unp, !(ap->a_mode & O_TRUNC), cred, td); if (error != 0) goto unionfs_open_abort; targetvp = uvp = unp->un_uppervp; } else targetvp = lvp; } else targetvp = uvp; } error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp); if (error == 0) { if (targetvp == uvp) { if (uvp->v_type == VDIR && lvp != NULLVP && unsp->uns_lower_opencnt <= 0) { /* open lower for readdir */ error = VOP_OPEN(lvp, FREAD, cred, td, NULL); if (error != 0) { VOP_CLOSE(uvp, ap->a_mode, cred, td); goto unionfs_open_abort; } unsp->uns_node_flag |= UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt++; } unsp->uns_upper_opencnt++; } else { unsp->uns_lower_opencnt++; unsp->uns_lower_openmode = ap->a_mode; } ap->a_vp->v_object = targetvp->v_object; } unionfs_open_abort: if (error != 0) unionfs_tryrem_node_status(unp, td, unsp); UNIONFS_INTERNAL_DEBUG("unionfs_open: leave (%d)\n", error); return (error); } static int unionfs_close(struct vop_close_args *ap) { int error; int locked; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct ucred *cred; struct thread *td; struct vnode *ovp; UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n"); locked = 0; unp = VTOUNIONFS(ap->a_vp); cred = ap->a_cred; td = ap->a_td; if (VOP_ISLOCKED(ap->a_vp, td) != LK_EXCLUSIVE) { - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); locked = 1; } unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) { #ifdef DIAGNOSTIC printf("unionfs_close: warning: open count is 0\n"); #endif if (unp->un_uppervp != NULLVP) ovp = unp->un_uppervp; else ovp = unp->un_lowervp; } else if (unsp->uns_upper_opencnt > 0) ovp = unp->un_uppervp; else ovp = unp->un_lowervp; error = VOP_CLOSE(ovp, ap->a_fflag, cred, td); if (error != 0) goto unionfs_close_abort; ap->a_vp->v_object = ovp->v_object; if (ovp == unp->un_uppervp) { unsp->uns_upper_opencnt--; if (unsp->uns_upper_opencnt == 0) { if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) { VOP_CLOSE(unp->un_lowervp, FREAD, cred, td); unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt--; } if (unsp->uns_lower_opencnt > 0) ap->a_vp->v_object = unp->un_lowervp->v_object; } } else unsp->uns_lower_opencnt--; unionfs_close_abort: unionfs_tryrem_node_status(unp, td, unsp); if (locked != 0) VOP_UNLOCK(ap->a_vp, 0, td); UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error); return (error); } /* * Check the access mode toward shadow file/dir. */ static int unionfs_check_corrected_access(u_short mode, struct vattr *va, struct ucred *cred) { int count; uid_t uid; /* upper side vnode's uid */ gid_t gid; /* upper side vnode's gid */ u_short vmode; /* upper side vnode's mode */ gid_t *gp; u_short mask; mask = 0; uid = va->va_uid; gid = va->va_gid; vmode = va->va_mode; /* check owner */ if (cred->cr_uid == uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((vmode & mask) == mask ? 0 : EACCES); } /* check group */ count = 0; gp = cred->cr_groups; for (; count < cred->cr_ngroups; count++, gp++) { if (gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((vmode & mask) == mask ? 0 : EACCES); } } /* check other */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((vmode & mask) == mask ? 0 : EACCES); } static int unionfs_access(struct vop_access_args *ap) { struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; int mode; int error; UNIONFS_INTERNAL_DEBUG("unionfs_access: enter\n"); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; mode = ap->a_mode; error = EACCES; if ((mode & VWRITE) && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } if (uvp != NULLVP) { error = VOP_ACCESS(uvp, mode, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } if (lvp != NULLVP) { if (mode & VWRITE) { if (ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } else if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { /* check shadow file/dir */ if (ump->um_copymode != UNIONFS_TRANSPARENT) { error = unionfs_create_uppervattr(ump, lvp, &va, ap->a_cred, td); if (error != 0) return (error); error = unionfs_check_corrected_access( mode, &va, ap->a_cred); if (error != 0) return (error); } } mode &= ~VWRITE; mode |= VREAD; /* will copy to upper */ } error = VOP_ACCESS(lvp, mode, ap->a_cred, td); } UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } static int unionfs_getattr(struct vop_getattr_args *ap) { int error; struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: enter\n"); unp = VTOUNIONFS(ap->a_vp); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (uvp != NULLVP) { if ((error = VOP_GETATTR(uvp, ap->a_vap, ap->a_cred, td)) == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } error = VOP_GETATTR(lvp, ap->a_vap, ap->a_cred, td); if (error == 0 && !(ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY)) { /* correct the attr toward shadow file/dir. */ if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { unionfs_create_uppervattr_core(ump, ap->a_vap, &va, td); ap->a_vap->va_mode = va.va_mode; ap->a_vap->va_uid = va.va_uid; ap->a_vap->va_gid = va.va_gid; } } if (error == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } static int unionfs_setattr(struct vop_setattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr *vap; UNIONFS_INTERNAL_DEBUG("unionfs_setattr: enter\n"); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; vap = ap->a_vap; if ((ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) && (vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { error = unionfs_copyfile(unp, (vap->va_size != 0), ap->a_cred, td); if (error != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETATTR(uvp, vap, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error); return (error); } static int unionfs_read(struct vop_read_args *ap) { int error; struct unionfs_node *unp; struct vnode *tvp; /* UNIONFS_INTERNAL_DEBUG("unionfs_read: enter\n"); */ unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READ(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_read: leave (%d)\n", error); */ return (error); } static int unionfs_write(struct vop_write_args *ap) { int error; struct unionfs_node *unp; struct vnode *tvp; /* UNIONFS_INTERNAL_DEBUG("unionfs_write: enter\n"); */ unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_write: leave (%d)\n", error); */ return (error); } static int unionfs_lease(struct vop_lease_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_lease: enter\n"); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_LEASE(vp, ap->a_td, ap->a_cred, ap->a_flag); UNIONFS_INTERNAL_DEBUG("unionfs_lease: lease (%d)\n", error); return (error); } static int unionfs_ioctl(struct vop_ioctl_args *ap) { int error; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: enter\n"); - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td); + vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, ap->a_td, unsp); VOP_UNLOCK(ap->a_vp, 0, ap->a_td); if (ovp == NULLVP) return (EBADF); error = VOP_IOCTL(ovp, ap->a_command, ap->a_data, ap->a_fflag, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: lease (%d)\n", error); return (error); } static int unionfs_poll(struct vop_poll_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td); + vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, ap->a_td, unsp); VOP_UNLOCK(ap->a_vp, 0, ap->a_td); if (ovp == NULLVP) return (EBADF); return (VOP_POLL(ovp, ap->a_events, ap->a_cred, ap->a_td)); } static int unionfs_fsync(struct vop_fsync_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, ap->a_td, unsp); if (ovp == NULLVP) return (EBADF); return (VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td)); } static int unionfs_remove(struct vop_remove_args *ap) { int error; struct unionfs_node *dunp; struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; struct componentname *cnp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_remove: enter\n"); error = 0; dunp = VTOUNIONFS(ap->a_dvp); unp = VTOUNIONFS(ap->a_vp); udvp = dunp->un_uppervp; uvp = unp->un_uppervp; lvp = unp->un_lowervp; cnp = ap->a_cnp; td = curthread; if (udvp == NULLVP) return (EROFS); if (uvp != NULLVP) { ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) cnp->cn_flags |= DOWHITEOUT; error = VOP_REMOVE(udvp, uvp, cnp); } else if (lvp != NULLVP) error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path); UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error); return (error); } static int unionfs_link(struct vop_link_args *ap) { int error; int needrelookup; struct unionfs_node *dunp; struct unionfs_node *unp; struct vnode *udvp; struct vnode *uvp; struct componentname *cnp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n"); error = 0; needrelookup = 0; dunp = VTOUNIONFS(ap->a_tdvp); unp = NULL; udvp = dunp->un_uppervp; uvp = NULLVP; cnp = ap->a_cnp; td = curthread; if (udvp == NULLVP) return (EROFS); if (ap->a_vp->v_op != &unionfs_vnodeops) uvp = ap->a_vp; else { unp = VTOUNIONFS(ap->a_vp); if (unp->un_uppervp == NULLVP) { if (ap->a_vp->v_type != VREG) return (EOPNOTSUPP); error = unionfs_copyfile(unp, 1, cnp->cn_cred, td); if (error != 0) return (error); needrelookup = 1; } uvp = unp->un_uppervp; } if (needrelookup != 0) error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td); if (error == 0) error = VOP_LINK(udvp, uvp, cnp); UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error); return (error); } static int unionfs_rename(struct vop_rename_args *ap) { int error; struct vnode *fdvp; struct vnode *fvp; struct componentname *fcnp; struct vnode *tdvp; struct vnode *tvp; struct componentname *tcnp; struct vnode *ltdvp; struct vnode *ltvp; struct thread *td; /* rename target vnodes */ struct vnode *rfdvp; struct vnode *rfvp; struct vnode *rtdvp; struct vnode *rtvp; int needrelookup; struct unionfs_mount *ump; struct unionfs_node *unp; UNIONFS_INTERNAL_DEBUG("unionfs_rename: enter\n"); error = 0; fdvp = ap->a_fdvp; fvp = ap->a_fvp; fcnp = ap->a_fcnp; tdvp = ap->a_tdvp; tvp = ap->a_tvp; tcnp = ap->a_tcnp; ltdvp = NULLVP; ltvp = NULLVP; td = curthread; rfdvp = fdvp; rfvp = fvp; rtdvp = tdvp; rtvp = tvp; needrelookup = 0; #ifdef DIAGNOSTIC if (!(fcnp->cn_flags & HASBUF) || !(tcnp->cn_flags & HASBUF)) panic("unionfs_rename: no name"); #endif /* check for cross device rename */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULLVP && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto unionfs_rename_abort; } /* Renaming a file to itself has no effect. */ if (fvp == tvp) goto unionfs_rename_abort; /* * from/to vnode is unionfs node. */ unp = VTOUNIONFS(fdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fdvp=%p, ufdvp=%p, lfdvp=%p\n", fdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rfdvp = unp->un_uppervp; vref(rfdvp); unp = VTOUNIONFS(fvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp); #endif ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount); if (unp->un_uppervp == NULLVP) { switch (fvp->v_type) { case VREG: - if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto unionfs_rename_abort; error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td); VOP_UNLOCK(fvp, 0, td); if (error != 0) goto unionfs_rename_abort; break; case VDIR: - if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto unionfs_rename_abort; error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td); VOP_UNLOCK(fvp, 0, td); if (error != 0) goto unionfs_rename_abort; break; default: error = ENODEV; goto unionfs_rename_abort; } needrelookup = 1; } if (unp->un_lowervp != NULLVP) fcnp->cn_flags |= DOWHITEOUT; rfvp = unp->un_uppervp; vref(rfvp); unp = VTOUNIONFS(tdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tdvp=%p, utdvp=%p, ltdvp=%p\n", tdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rtdvp = unp->un_uppervp; ltdvp = unp->un_lowervp; vref(rtdvp); if (tdvp == tvp) { rtvp = rtdvp; vref(rtvp); } else if (tvp != NULLVP) { unp = VTOUNIONFS(tvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tvp=%p, utvp=%p, ltvp=%p\n", tvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) rtvp = NULLVP; else { if (tvp->v_type == VDIR) { error = EINVAL; goto unionfs_rename_abort; } rtvp = unp->un_uppervp; ltvp = unp->un_lowervp; vref(rtvp); } } if (needrelookup != 0) { - if ((error = vn_lock(fdvp, LK_EXCLUSIVE, td)) != 0) + if ((error = vn_lock(fdvp, LK_EXCLUSIVE)) != 0) goto unionfs_rename_abort; error = unionfs_relookup_for_delete(fdvp, fcnp, td); VOP_UNLOCK(fdvp, 0, td); if (error != 0) goto unionfs_rename_abort; /* Locke of tvp is canceled in order to avoid recursive lock. */ if (tvp != NULLVP && tvp != tdvp) VOP_UNLOCK(tvp, 0, td); error = unionfs_relookup_for_rename(tdvp, tcnp, td); if (tvp != NULLVP && tvp != tdvp) - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto unionfs_rename_abort; } error = VOP_RENAME(rfdvp, rfvp, fcnp, rtdvp, rtvp, tcnp); if (error == 0) { if (rtvp != NULLVP && rtvp->v_type == VDIR) cache_purge(tdvp); if (fvp->v_type == VDIR && fdvp != tdvp) cache_purge(fdvp); } if (fdvp != rfdvp) vrele(fdvp); if (fvp != rfvp) vrele(fvp); if (ltdvp != NULLVP) VOP_UNLOCK(ltdvp, 0, td); if (tdvp != rtdvp) vrele(tdvp); if (ltvp != NULLVP) VOP_UNLOCK(ltvp, 0, td); if (tvp != rtvp && tvp != NULLVP) { if (rtvp == NULLVP) vput(tvp); else vrele(tvp); } UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); unionfs_rename_abort: if (fdvp != rfdvp) vrele(rfdvp); if (fvp != rfvp) vrele(rfvp); if (tdvp != rtdvp) vrele(rtdvp); vput(tdvp); if (tvp != rtvp && rtvp != NULLVP) vrele(rtvp); if (tvp != NULLVP) { if (tdvp != tvp) vput(tvp); else vrele(tvp); } vrele(fdvp); vrele(fvp); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); } static int unionfs_mkdir(struct vop_mkdir_args *ap) { int error; int lkflags; struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; struct vattr va; UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: enter\n"); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; td = curthread; udvp = dunp->un_uppervp; if (udvp != NULLVP) { /* check opaque */ if (!(cnp->cn_flags & ISWHITEOUT)) { error = VOP_GETATTR(udvp, &va, cnp->cn_cred, td); if (error != 0) return (error); if (va.va_flags & OPAQUE) cnp->cn_flags |= ISWHITEOUT; } if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(uvp, 0, td); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); cnp->cn_lkflags = lkflags; vrele(uvp); } } UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error); return (error); } static int unionfs_rmdir(struct vop_rmdir_args *ap) { int error; struct unionfs_node *dunp; struct unionfs_node *unp; struct unionfs_mount *ump; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: enter\n"); error = 0; dunp = VTOUNIONFS(ap->a_dvp); unp = VTOUNIONFS(ap->a_vp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (udvp == NULLVP) return (EROFS); if (udvp == uvp) return (EOPNOTSUPP); if (uvp != NULLVP) { if (lvp != NULLVP) { error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td); if (error != 0) return (error); } ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) cnp->cn_flags |= DOWHITEOUT; error = VOP_RMDIR(udvp, uvp, cnp); } else if (lvp != NULLVP) error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path); if (error == 0) { cache_purge(ap->a_dvp); cache_purge(ap->a_vp); } UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: leave (%d)\n", error); return (error); } static int unionfs_symlink(struct vop_symlink_args *ap) { int error; int lkflags; struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; UNIONFS_INTERNAL_DEBUG("unionfs_symlink: enter\n"); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; td = curthread; udvp = dunp->un_uppervp; if (udvp != NULLVP) { error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target); if (error == 0) { VOP_UNLOCK(uvp, 0, td); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); cnp->cn_lkflags = lkflags; vrele(uvp); } } UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error); return (error); } static int unionfs_readdir(struct vop_readdir_args *ap) { int error; int eofflag; int locked; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct uio *uio; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; int ncookies_bk; u_long *cookies_bk; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: enter\n"); error = 0; eofflag = 0; locked = 0; unp = VTOUNIONFS(ap->a_vp); uio = ap->a_uio; uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = uio->uio_td; ncookies_bk = 0; cookies_bk = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); /* check opaque */ if (uvp != NULLVP && lvp != NULLVP) { if ((error = VOP_GETATTR(uvp, &va, ap->a_cred, td)) != 0) goto unionfs_readdir_exit; if (va.va_flags & OPAQUE) lvp = NULLVP; } /* check the open count. unionfs needs to open before readdir. */ if (VOP_ISLOCKED(ap->a_vp, td) != LK_EXCLUSIVE) { - vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY, td); + vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY); locked = 1; } unionfs_get_node_status(unp, td, &unsp); if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) || (lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) { unionfs_tryrem_node_status(unp, td, unsp); error = EBADF; } if (locked == 1) - vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY, td); + vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY); if (error != 0) goto unionfs_readdir_exit; /* upper only */ if (uvp != NULLVP && lvp == NULLVP) { error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unsp->uns_readdir_status = 0; goto unionfs_readdir_exit; } /* lower only */ if (uvp == NULLVP && lvp != NULLVP) { error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unsp->uns_readdir_status = 2; goto unionfs_readdir_exit; } /* * readdir upper and lower */ KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp")); KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp")); if (uio->uio_offset == 0) unsp->uns_readdir_status = 0; if (unsp->uns_readdir_status == 0) { /* read upper */ error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag, ap->a_ncookies, ap->a_cookies); if (error != 0 || eofflag == 0) goto unionfs_readdir_exit; unsp->uns_readdir_status = 1; /* * ufs(and other fs) needs size of uio_resid larger than * DIRBLKSIZ. * size of DIRBLKSIZ equals DEV_BSIZE. * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h) */ if (uio->uio_resid <= (uio->uio_resid & (DEV_BSIZE -1))) goto unionfs_readdir_exit; /* * backup cookies * It prepares to readdir in lower. */ if (ap->a_ncookies != NULL) { ncookies_bk = *(ap->a_ncookies); *(ap->a_ncookies) = 0; } if (ap->a_cookies != NULL) { cookies_bk = *(ap->a_cookies); *(ap->a_cookies) = NULL; } } /* initialize for readdir in lower */ if (unsp->uns_readdir_status == 1) { unsp->uns_readdir_status = 2; uio->uio_offset = 0; } if (lvp == NULLVP) { error = EBADF; goto unionfs_readdir_exit; } /* read lower */ error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); if (cookies_bk != NULL) { /* merge cookies */ int size; u_long *newcookies, *pos; size = *(ap->a_ncookies) + ncookies_bk; newcookies = (u_long *) malloc(size * sizeof(u_long), M_TEMP, M_WAITOK); pos = newcookies; memcpy(pos, cookies_bk, ncookies_bk * sizeof(u_long)); pos += ncookies_bk * sizeof(u_long); memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(u_long)); free(cookies_bk, M_TEMP); free(*(ap->a_cookies), M_TEMP); *(ap->a_ncookies) = size; *(ap->a_cookies) = newcookies; } unionfs_readdir_exit: if (error != 0 && ap->a_eofflag != NULL) *(ap->a_eofflag) = 1; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error); return (error); } static int unionfs_readlink(struct vop_readlink_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_readlink: enter\n"); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READLINK(vp, ap->a_uio, ap->a_cred); UNIONFS_INTERNAL_DEBUG("unionfs_readlink: leave (%d)\n", error); return (error); } static int unionfs_getwritemount(struct vop_getwritemount_args *ap) { int error; struct vnode *uvp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: enter\n"); error = 0; vp = ap->a_vp; if (vp == NULLVP || (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EACCES); uvp = UNIONFSVPTOUPPERVP(vp); if (uvp == NULLVP && VREG == vp->v_type) uvp = UNIONFSVPTOUPPERVP(VTOUNIONFS(vp)->un_dvp); if (uvp != NULLVP) error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp); else { VI_LOCK(vp); if (vp->v_iflag & VI_FREE) error = EOPNOTSUPP; else error = EACCES; VI_UNLOCK(vp); } UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: leave (%d)\n", error); return (error); } static int unionfs_inactive(struct vop_inactive_args *ap) { ap->a_vp->v_object = NULL; vrecycle(ap->a_vp, ap->a_td); return (0); } static int unionfs_reclaim(struct vop_reclaim_args *ap) { /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: enter\n"); */ unionfs_noderem(ap->a_vp, ap->a_td); /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: leave\n"); */ return (0); } static int unionfs_print(struct vop_print_args *ap) { struct unionfs_node *unp; /* struct unionfs_node_status *unsp; */ unp = VTOUNIONFS(ap->a_vp); /* unionfs_get_node_status(unp, curthread, &unsp); */ printf("unionfs_vp=%p, uppervp=%p, lowervp=%p\n", ap->a_vp, unp->un_uppervp, unp->un_lowervp); /* printf("unionfs opencnt: uppervp=%d, lowervp=%d\n", unsp->uns_upper_opencnt, unsp->uns_lower_opencnt); */ if (unp->un_uppervp != NULLVP) vprint("unionfs: upper", unp->un_uppervp); if (unp->un_lowervp != NULLVP) vprint("unionfs: lower", unp->un_lowervp); return (0); } static int unionfs_get_llt_revlock(int flags) { int count; flags &= LK_TYPE_MASK; for (count = 0; un_llt[count].lock != 0; count++) { if (flags == un_llt[count].lock) { return un_llt[count].revlock; } } return 0; } static int unionfs_lock(struct vop_lock1_args *ap) { int error; int flags; int revlock; int uhold; struct mount *mp; struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; struct thread *td; error = 0; uhold = 0; flags = ap->a_flags; vp = ap->a_vp; td = ap->a_td; if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK)) return (VOP_UNLOCK(vp, flags, td)); if ((revlock = unionfs_get_llt_revlock(flags)) == 0) panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); mp = vp->v_mount; if (mp == NULL) goto unionfs_lock_null_vnode; ump = MOUNTTOUNIONFSMOUNT(mp); unp = VTOUNIONFS(vp); if (ump == NULL || unp == NULL) goto unionfs_lock_null_vnode; lvp = unp->un_lowervp; uvp = unp->un_uppervp; if ((mp->mnt_kern_flag & MNTK_MPSAFE) != 0 && (vp->v_iflag & VI_OWEINACT) != 0) flags |= LK_NOWAIT; /* * Sometimes, lower or upper is already exclusive locked. * (ex. vfs_domount: mounted vnode is already locked.) */ if ((flags & LK_TYPE_MASK) == LK_EXCLUSIVE && vp == ump->um_rootvp) flags |= LK_CANRECURSE; if (lvp != NULLVP) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(lvp); VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_LOCK(lvp, flags, td); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) { VI_UNLOCK(vp); if (error == 0) VOP_UNLOCK(lvp, 0, td); vdrop(lvp); return (vop_stdlock(ap)); } } if (error == 0 && uvp != NULLVP) { VI_LOCK_FLAGS(uvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(uvp); uhold = 1; VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_LOCK(uvp, flags, td); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) { VI_UNLOCK(vp); if (error == 0) { VOP_UNLOCK(uvp, 0, td); if (lvp != NULLVP) VOP_UNLOCK(lvp, 0, td); } if (lvp != NULLVP) vdrop(lvp); vdrop(uvp); return (vop_stdlock(ap)); } if (error != 0 && lvp != NULLVP) { VI_UNLOCK(vp); if ((revlock & LK_TYPE_MASK) == LK_RELEASE) VOP_UNLOCK(lvp, revlock, td); else - vn_lock(lvp, revlock | LK_RETRY, td); + vn_lock(lvp, revlock | LK_RETRY); goto unionfs_lock_abort; } } VI_UNLOCK(vp); unionfs_lock_abort: if (lvp != NULLVP) vdrop(lvp); if (uhold != 0) vdrop(uvp); return (error); unionfs_lock_null_vnode: ap->a_flags |= LK_INTERLOCK; return (vop_stdlock(ap)); } static int unionfs_unlock(struct vop_unlock_args *ap) { int error; int flags; int mtxlkflag; int uhold; struct vnode *vp; struct vnode *lvp; struct vnode *uvp; struct unionfs_node *unp; error = 0; mtxlkflag = 0; uhold = 0; flags = ap->a_flags | LK_RELEASE; vp = ap->a_vp; if ((flags & LK_INTERLOCK) != 0) mtxlkflag = 1; else if (mtx_owned(VI_MTX(vp)) == 0) { VI_LOCK(vp); mtxlkflag = 2; } unp = VTOUNIONFS(vp); if (unp == NULL) goto unionfs_unlock_null_vnode; lvp = unp->un_lowervp; uvp = unp->un_uppervp; if (lvp != NULLVP) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(lvp); VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_UNLOCK(lvp, flags, ap->a_td); VI_LOCK(vp); } if (error == 0 && uvp != NULLVP) { VI_LOCK_FLAGS(uvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(uvp); uhold = 1; VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_UNLOCK(uvp, flags, ap->a_td); VI_LOCK(vp); } VI_UNLOCK(vp); if (lvp != NULLVP) vdrop(lvp); if (uhold != 0) vdrop(uvp); if (mtxlkflag == 0) VI_LOCK(vp); return error; unionfs_unlock_null_vnode: if (mtxlkflag == 2) VI_UNLOCK(vp); return (vop_stdunlock(ap)); } static int unionfs_pathconf(struct vop_pathconf_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); return (VOP_PATHCONF(vp, ap->a_name, ap->a_retval)); } static int unionfs_advlock(struct vop_advlock_args *ap) { int error; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *vp; struct vnode *uvp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_advlock: enter\n"); vp = ap->a_vp; td = curthread; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; if (uvp == NULLVP) { error = unionfs_copyfile(unp, 1, td->td_ucred, td); if (error != 0) goto unionfs_advlock_abort; uvp = unp->un_uppervp; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0) { /* try reopen the vnode */ error = VOP_OPEN(uvp, unsp->uns_lower_openmode, td->td_ucred, td, NULL); if (error) goto unionfs_advlock_abort; unsp->uns_upper_opencnt++; VOP_CLOSE(unp->un_lowervp, unsp->uns_lower_openmode, td->td_ucred, td); unsp->uns_lower_opencnt--; } else unionfs_tryrem_node_status(unp, td, unsp); } VOP_UNLOCK(vp, 0, td); error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; unionfs_advlock_abort: VOP_UNLOCK(vp, 0, td); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; } static int unionfs_strategy(struct vop_strategy_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); #ifdef DIAGNOSTIC if (vp == NULLVP) panic("unionfs_strategy: nullvp"); if (ap->a_bp->b_iocmd == BIO_WRITE && vp == unp->un_lowervp) panic("unionfs_strategy: writing to lowervp"); #endif return (VOP_STRATEGY(vp, ap->a_bp)); } static int unionfs_getacl(struct vop_getacl_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: enter\n"); error = VOP_GETACL(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: leave (%d)\n", error); return (error); } static int unionfs_setacl(struct vop_setacl_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_setacl: enter\n"); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error); return (error); } static int unionfs_aclcheck(struct vop_aclcheck_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: enter\n"); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_ACLCHECK(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: leave (%d)\n", error); return (error); } static int unionfs_openextattr(struct vop_openextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); if ((tvp == unp->un_uppervp && (unp->un_flag & UNIONFS_OPENEXTU)) || (tvp == unp->un_lowervp && (unp->un_flag & UNIONFS_OPENEXTL))) return (EBUSY); error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td); if (error == 0) { - vn_lock(vp, LK_UPGRADE | LK_RETRY, ap->a_td); + vn_lock(vp, LK_UPGRADE | LK_RETRY); if (tvp == unp->un_uppervp) unp->un_flag |= UNIONFS_OPENEXTU; else unp->un_flag |= UNIONFS_OPENEXTL; - vn_lock(vp, LK_DOWNGRADE | LK_RETRY, ap->a_td); + vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } return (error); } static int unionfs_closeextattr(struct vop_closeextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) tvp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) tvp = unp->un_lowervp; if (tvp == NULLVP) return (EOPNOTSUPP); error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td); if (error == 0) { - vn_lock(vp, LK_UPGRADE | LK_RETRY, ap->a_td); + vn_lock(vp, LK_UPGRADE | LK_RETRY); if (tvp == unp->un_uppervp) unp->un_flag &= ~UNIONFS_OPENEXTU; else unp->un_flag &= ~UNIONFS_OPENEXTL; - vn_lock(vp, LK_DOWNGRADE | LK_RETRY, ap->a_td); + vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } return (error); } static int unionfs_getextattr(struct vop_getextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_GETEXTATTR(vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_setextattr(struct vop_setextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { unionfs_setextattr_reopen: if ((unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_setextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_setextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_uio, cred, td); unionfs_setextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error); return (error); } static int unionfs_listextattr(struct vop_listextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_LISTEXTATTR(vp, ap->a_attrnamespace, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_deleteextattr(struct vop_deleteextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { unionfs_deleteextattr_reopen: if ((unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_deleteextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_deleteextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_DELETEEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); unionfs_deleteextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: leave (%d)\n", error); return (error); } static int unionfs_setlabel(struct vop_setlabel_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: enter\n"); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETLABEL(uvp, ap->a_label, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: leave (%d)\n", error); return (error); } static int unionfs_vptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } struct vop_vector unionfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = unionfs_access, .vop_aclcheck = unionfs_aclcheck, .vop_advlock = unionfs_advlock, .vop_bmap = VOP_EOPNOTSUPP, .vop_cachedlookup = unionfs_lookup, .vop_close = unionfs_close, .vop_closeextattr = unionfs_closeextattr, .vop_create = unionfs_create, .vop_deleteextattr = unionfs_deleteextattr, .vop_fsync = unionfs_fsync, .vop_getacl = unionfs_getacl, .vop_getattr = unionfs_getattr, .vop_getextattr = unionfs_getextattr, .vop_getwritemount = unionfs_getwritemount, .vop_inactive = unionfs_inactive, .vop_ioctl = unionfs_ioctl, .vop_lease = unionfs_lease, .vop_link = unionfs_link, .vop_listextattr = unionfs_listextattr, .vop_lock1 = unionfs_lock, .vop_lookup = vfs_cache_lookup, .vop_mkdir = unionfs_mkdir, .vop_mknod = unionfs_mknod, .vop_open = unionfs_open, .vop_openextattr = unionfs_openextattr, .vop_pathconf = unionfs_pathconf, .vop_poll = unionfs_poll, .vop_print = unionfs_print, .vop_read = unionfs_read, .vop_readdir = unionfs_readdir, .vop_readlink = unionfs_readlink, .vop_reclaim = unionfs_reclaim, .vop_remove = unionfs_remove, .vop_rename = unionfs_rename, .vop_rmdir = unionfs_rmdir, .vop_setacl = unionfs_setacl, .vop_setattr = unionfs_setattr, .vop_setextattr = unionfs_setextattr, .vop_setlabel = unionfs_setlabel, .vop_strategy = unionfs_strategy, .vop_symlink = unionfs_symlink, .vop_unlock = unionfs_unlock, .vop_whiteout = unionfs_whiteout, .vop_write = unionfs_write, .vop_vptofh = unionfs_vptofh, }; Index: head/sys/gnu/fs/ext2fs/ext2_lookup.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_lookup.c (revision 175201) +++ head/sys/gnu/fs/ext2fs/ext2_lookup.c (revision 175202) @@ -1,1082 +1,1082 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DIAGNOSTIC static int dirchk = 1; #else static int dirchk = 0; #endif static SYSCTL_NODE(_vfs, OID_AUTO, e2fs, CTLFLAG_RD, 0, "EXT2FS filesystem"); SYSCTL_INT(_vfs_e2fs, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); /* DIRBLKSIZE in ffs is DEV_BSIZE (in most cases 512) while it is the native blocksize in ext2fs - thus, a #define is no longer appropriate */ #undef DIRBLKSIZ static u_char ext2_ft_to_dt[] = { DT_UNKNOWN, /* EXT2_FT_UNKNOWN */ DT_REG, /* EXT2_FT_REG_FILE */ DT_DIR, /* EXT2_FT_DIR */ DT_CHR, /* EXT2_FT_CHRDEV */ DT_BLK, /* EXT2_FT_BLKDEV */ DT_FIFO, /* EXT2_FT_FIFO */ DT_SOCK, /* EXT2_FT_SOCK */ DT_LNK, /* EXT2_FT_SYMLINK */ }; #define FTTODT(ft) \ ((ft) > sizeof(ext2_ft_to_dt) / sizeof(ext2_ft_to_dt[0]) ? \ DT_UNKNOWN : ext2_ft_to_dt[(ft)]) static u_char dt_to_ext2_ft[] = { EXT2_FT_UNKNOWN, /* DT_UNKNOWN */ EXT2_FT_FIFO, /* DT_FIFO */ EXT2_FT_CHRDEV, /* DT_CHR */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_DIR, /* DT_DIR */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_BLKDEV, /* DT_BLK */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_REG_FILE, /* DT_REG */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_SYMLINK, /* DT_LNK */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_SOCK, /* DT_SOCK */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_UNKNOWN, /* DT_WHT */ }; #define DTTOFT(dt) \ ((dt) > sizeof(dt_to_ext2_ft) / sizeof(dt_to_ext2_ft[0]) ? \ EXT2_FT_UNKNOWN : dt_to_ext2_ft[(dt)]) static int ext2_dirbadentry(struct vnode *dp, struct ext2_dir_entry_2 *de, int entryoffsetinblock); /* * Vnode op for reading directories. * * The routine below assumes that the on-disk format of a directory * is the same as that defined by . If the on-disk * format changes, then it will be necessary to do a conversion * from the on-disk format that read returns to the format defined * by . */ /* * this is exactly what we do here - the problem is that the conversion * will blow up some entries by four bytes, so it can't be done in place. * This is too bad. Right now the conversion is done entry by entry, the * converted entry is sent via uiomove. * * XXX allocate a buffer, convert as many entries as possible, then send * the whole buffer to uiomove */ int ext2_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { struct uio *uio = ap->a_uio; int count, error; struct ext2_dir_entry_2 *edp, *dp; int ncookies; struct dirent dstdp; struct uio auio; struct iovec aiov; caddr_t dirbuf; int DIRBLKSIZ = VTOI(ap->a_vp)->i_e2fs->s_blocksize; int readcnt; off_t startoffset = uio->uio_offset; count = uio->uio_resid; /* * Avoid complications for partial directory entries by adjusting * the i/o to end at a block boundary. Don't give up (like ufs * does) if the initial adjustment gives a negative count, since * many callers don't supply a large enough buffer. The correct * size is a little larger than DIRBLKSIZ to allow for expansion * of directory entries, but some callers just use 512. */ count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); if (count <= 0) count += DIRBLKSIZ; #ifdef EXT2FS_DEBUG printf("ext2_readdir: uio_offset = %lld, uio_resid = %d, count = %d\n", uio->uio_offset, uio->uio_resid, count); #endif auio = *uio; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = count; auio.uio_segflg = UIO_SYSSPACE; aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct ext2_dir_entry_2 *)&dirbuf[readcnt]; ncookies = 0; bzero(&dstdp, offsetof(struct dirent, d_name)); for (dp = (struct ext2_dir_entry_2 *)dirbuf; !error && uio->uio_resid > 0 && dp < edp; ) { /*- * "New" ext2fs directory entries differ in 3 ways * from ufs on-disk ones: * - the name is not necessarily NUL-terminated. * - the file type field always exists and always * follows the name length field. * - the file type is encoded in a different way. * * "Old" ext2fs directory entries need no special * conversions, since they are binary compatible * with "new" entries having a file type of 0 (i.e., * EXT2_FT_UNKNOWN). Splitting the old name length * field didn't make a mess like it did in ufs, * because ext2fs uses a machine-independent disk * layout. */ dstdp.d_fileno = dp->inode; dstdp.d_type = FTTODT(dp->file_type); dstdp.d_namlen = dp->name_len; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct dirent, d_name) - dstdp.d_namlen); if (dp->rec_len > 0) { if(dstdp.d_reclen <= uio->uio_resid) { /* advance dp */ dp = (struct ext2_dir_entry_2 *) ((char *)dp + dp->rec_len); error = uiomove(&dstdp, dstdp.d_reclen, uio); if (!error) ncookies++; } else break; } else { error = EIO; break; } } /* we need to correct uio_offset */ uio->uio_offset = startoffset + (caddr_t)dp - dirbuf; if (!error && ap->a_ncookies != NULL) { u_long *cookiep, *cookies, *ecookies; off_t off; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ext2_readdir: unexpected uio from NFS server"); MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); off = startoffset; for (dp = (struct ext2_dir_entry_2 *)dirbuf, cookiep = cookies, ecookies = cookies + ncookies; cookiep < ecookies; dp = (struct ext2_dir_entry_2 *)((caddr_t) dp + dp->rec_len)) { off += dp->rec_len; *cookiep++ = (u_long) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } } FREE(dirbuf, M_TEMP); if (ap->a_eofflag) *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; return (error); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the file system is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * Overall outline of ext2_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ext2_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *vdp; /* vnode for directory being searched */ struct inode *dp; /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct ext2_dir_entry_2 *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ enum {NONE, COMPACT, FOUND} slotstatus; doff_t slotoffset; /* offset of area with free space */ int slotsize; /* size of area at slotoffset */ int slotfreespace; /* amount of space free in slot */ int slotneeded; /* size of the entry we're seeking */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* prev entry dp->i_offset */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ int namlen, error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; struct thread *td = cnp->cn_thread; ino_t saved_ino; int DIRBLKSIZ = VTOI(ap->a_dvp)->i_e2fs->s_blocksize; bp = NULL; slotoffset = -1; *vpp = NULL; vdp = ap->a_dvp; dp = VTOI(vdp); /* * We now have a segment name to search for, and a directory to search. */ /* * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ slotstatus = FOUND; slotfreespace = slotsize = slotneeded = 0; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { slotstatus = NONE; slotneeded = EXT2_DIR_REC_LEN(cnp->cn_namelen); /* was slotneeded = (sizeof(struct direct) - MAXNAMLEN + cnp->cn_namelen + 3) &~ 3; */ } /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; if (nameiop != LOOKUP || dp->i_diroff == 0 || dp->i_diroff > dp->i_size) { entryoffsetinblock = 0; dp->i_offset = 0; numdirpasses = 1; } else { dp->i_offset = dp->i_diroff; if ((entryoffsetinblock = dp->i_offset & bmask) && (error = ext2_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } prevoff = dp->i_offset; endsearch = roundup(dp->i_size, DIRBLKSIZ); enduseful = 0; searchloop: while (dp->i_offset < endsearch) { /* * If necessary, get the next directory block. */ if ((dp->i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = ext2_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp)) != 0) return (error); entryoffsetinblock = 0; } /* * If still looking for a slot, and at a DIRBLKSIZE * boundary, have to start looking for free space again. */ if (slotstatus == NONE && (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { slotoffset = -1; slotfreespace = 0; } /* * Get pointer to next entry. * Full validation checks are slow, so we only check * enough to insure forward progress through the * directory. Complete checks can be run by setting * "vfs.e2fs.dirchk" to be true. */ ep = (struct ext2_dir_entry_2 *) ((char *)bp->b_data + entryoffsetinblock); if (ep->rec_len == 0 || (dirchk && ext2_dirbadentry(vdp, ep, entryoffsetinblock))) { int i; ext2_dirbad(dp, dp->i_offset, "mangled entry"); i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); dp->i_offset += i; entryoffsetinblock += i; continue; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. */ if (slotstatus != FOUND) { int size = ep->rec_len; if (ep->inode != 0) size -= EXT2_DIR_REC_LEN(ep->name_len); if (size > 0) { if (size >= slotneeded) { slotstatus = FOUND; slotoffset = dp->i_offset; slotsize = ep->rec_len; } else if (slotstatus == NONE) { slotfreespace += size; if (slotoffset == -1) slotoffset = dp->i_offset; if (slotfreespace >= slotneeded) { slotstatus = COMPACT; slotsize = dp->i_offset + ep->rec_len - slotoffset; } } } } /* * Check for a name match. */ if (ep->inode) { namlen = ep->name_len; if (namlen == cnp->cn_namelen && !bcmp(cnp->cn_nameptr, ep->name, (unsigned)namlen)) { /* * Save directory entry's inode number and * reclen in ndp->ni_ufs area, and release * directory buffer. */ dp->i_ino = ep->inode; dp->i_reclen = ep->rec_len; goto found; } } prevoff = dp->i_offset; dp->i_offset += ep->rec_len; entryoffsetinblock += ep->rec_len; if (ep->inode) enduseful = dp->i_offset; } /* notfound: */ /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; dp->i_offset = 0; endsearch = dp->i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN) && dp->i_nlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. */ if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0) return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, * then set dp->i_count to 0 indicating * that the new slot belongs at the end of the * directory. If we found a slot, then the new entry * can be put in the range from dp->i_offset to * dp->i_offset + dp->i_count. */ if (slotstatus == NONE) { dp->i_offset = roundup(dp->i_size, DIRBLKSIZ); dp->i_count = 0; enduseful = dp->i_offset; } else { dp->i_offset = slotoffset; dp->i_count = slotsize; if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } dp->i_endoff = roundup(enduseful, DIRBLKSIZ); dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * The pathname buffer is saved so that the name * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } /* * Insert name into cache (as non-existent) if appropriate. */ if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(vdp, *vpp, cnp); return (ENOENT); found: if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Check that directory length properly reflects presence * of this entry. */ if (entryoffsetinblock + EXT2_DIR_REC_LEN(ep->name_len) > dp->i_size) { ext2_dirbad(dp, dp->i_offset, "i_size too small"); dp->i_size = entryoffsetinblock+EXT2_DIR_REC_LEN(ep->name_len); dp->i_flag |= IN_CHANGE | IN_UPDATE; } brelse(bp); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { /* * Write access to directory required to delete files. */ if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0) return (error); /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there * is a previous entry in this block) in dp->i_count. * Save directory inode pointer in ndp->ni_dvp for dirremove(). */ if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; if (dp->i_number == dp->i_ino) { VREF(vdp); *vpp = vdp; return (0); } if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ if ((dp->i_mode & ISVTX) && cred->cr_uid != 0 && cred->cr_uid != dp->i_uid && VTOI(tdp)->i_uid != cred->cr_uid) { vput(tdp); return (EPERM); } *vpp = tdp; return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ if (dp->i_number == dp->i_ino) return (EISDIR); if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); *vpp = tdp; cnp->cn_flags |= SAVENAME; return (0); } /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the file system has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { saved_ino = dp->i_ino; VOP_UNLOCK(pdp, 0, td); /* race to get the inode */ error = VFS_VGET(vdp->v_mount, saved_ino, LK_EXCLUSIVE, &tdp); - vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) return (error); *vpp = tdp; } else if (dp->i_number == dp->i_ino) { VREF(vdp); /* we want ourself, ie "." */ *vpp = vdp; } else { if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } void ext2_dirbad(ip, offset, how) struct inode *ip; doff_t offset; char *how; { struct mount *mp; mp = ITOV(ip)->v_mount; (void)printf("%s: bad dir ino %lu at offset %ld: %s\n", mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how); if ((mp->mnt_flag & MNT_RDONLY) == 0) panic("ext2_dirbad: bad dir"); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than MAXNAMLEN * name must be as long as advertised, and null terminated */ /* * changed so that it confirms to ext2_check_dir_entry */ static int ext2_dirbadentry(dp, de, entryoffsetinblock) struct vnode *dp; struct ext2_dir_entry_2 *de; int entryoffsetinblock; { int DIRBLKSIZ = VTOI(dp)->i_e2fs->s_blocksize; char * error_msg = NULL; if (de->rec_len < EXT2_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; else if (de->rec_len % 4 != 0) error_msg = "rec_len % 4 != 0"; else if (de->rec_len < EXT2_DIR_REC_LEN(de->name_len)) error_msg = "reclen is too small for name_len"; else if (entryoffsetinblock + de->rec_len > DIRBLKSIZ) error_msg = "directory entry across blocks"; /* else LATER if (de->inode > dir->i_sb->u.ext2_sb.s_es->s_inodes_count) error_msg = "inode out of bounds"; */ if (error_msg != NULL) { printf("bad directory entry: %s\n", error_msg); printf("offset=%d, inode=%lu, rec_len=%u, name_len=%u\n", entryoffsetinblock, (unsigned long)de->inode, de->rec_len, de->name_len); } return error_msg == NULL ? 0 : 1; } /* * Write a directory entry after a call to namei, using the parameters * that it left in nameidata. The argument ip is the inode which the new * directory entry will refer to. Dvp is a pointer to the directory to * be written, which was left locked by namei. Remaining parameters * (dp->i_offset, dp->i_count) indicate how the space for the new * entry is to be obtained. */ int ext2_direnter(ip, dvp, cnp) struct inode *ip; struct vnode *dvp; struct componentname *cnp; { struct ext2_dir_entry_2 *ep, *nep; struct inode *dp; struct buf *bp; struct ext2_dir_entry_2 newdir; struct iovec aiov; struct uio auio; u_int dsize; int error, loc, newentrysize, spacefree; char *dirbuf; int DIRBLKSIZ = ip->i_e2fs->s_blocksize; #ifdef DIAGNOSTIC if ((cnp->cn_flags & SAVENAME) == 0) panic("direnter: missing name"); #endif dp = VTOI(dvp); newdir.inode = ip->i_number; newdir.name_len = cnp->cn_namelen; if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es, EXT2_FEATURE_INCOMPAT_FILETYPE)) newdir.file_type = DTTOFT(IFTODT(ip->i_mode)); else newdir.file_type = EXT2_FT_UNKNOWN; bcopy(cnp->cn_nameptr, newdir.name, (unsigned)cnp->cn_namelen + 1); newentrysize = EXT2_DIR_REC_LEN(newdir.name_len); if (dp->i_count == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (dp->i_offset & (DIRBLKSIZ - 1)) panic("ext2_direnter: newblk"); auio.uio_offset = dp->i_offset; newdir.rec_len = DIRBLKSIZ; auio.uio_resid = newentrysize; aiov.iov_len = newentrysize; aiov.iov_base = (caddr_t)&newdir; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = (struct thread *)0; error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred); if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) /* XXX should grow with balloc() */ panic("ext2_direnter: frag size"); else if (!error) { dp->i_size = roundup(dp->i_size, DIRBLKSIZ); dp->i_flag |= IN_CHANGE; } return (error); } /* * If dp->i_count is non-zero, then namei found space * for the new entry in the range dp->i_offset to * dp->i_offset + dp->i_count in the directory. * To use this space, we may have to compact the entries located * there, by copying them together towards the beginning of the * block, leaving the free space in one usable chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZE. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ if (dp->i_offset + dp->i_count > dp->i_size) dp->i_size = dp->i_offset + dp->i_count; /* * Get the block containing the space for the new directory entry. */ if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf, &bp)) != 0) return (error); /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region dp->i_offset to * dp->i_offset + dp->i_count would yield the * space. */ ep = (struct ext2_dir_entry_2 *)dirbuf; dsize = EXT2_DIR_REC_LEN(ep->name_len); spacefree = ep->rec_len - dsize; for (loc = ep->rec_len; loc < dp->i_count; ) { nep = (struct ext2_dir_entry_2 *)(dirbuf + loc); if (ep->inode) { /* trim the existing slot */ ep->rec_len = dsize; ep = (struct ext2_dir_entry_2 *)((char *)ep + dsize); } else { /* overwrite; nothing there; header is ours */ spacefree += dsize; } dsize = EXT2_DIR_REC_LEN(nep->name_len); spacefree += nep->rec_len - dsize; loc += nep->rec_len; bcopy((caddr_t)nep, (caddr_t)ep, dsize); } /* * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ if (ep->inode == 0) { if (spacefree + dsize < newentrysize) panic("ext2_direnter: compact1"); newdir.rec_len = spacefree + dsize; } else { if (spacefree < newentrysize) panic("ext2_direnter: compact2"); newdir.rec_len = spacefree; ep->rec_len = dsize; ep = (struct ext2_dir_entry_2 *)((char *)ep + dsize); } bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize); error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; if (!error && dp->i_endoff && dp->i_endoff < dp->i_size) error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC, cnp->cn_cred, cnp->cn_thread); return (error); } /* * Remove a directory entry after a call to namei, using * the parameters which it left in nameidata. The entry * dp->i_offset contains the offset into the directory of the * entry to be eliminated. The dp->i_count field contains the * size of the previous record in the directory. If this * is 0, the first entry is being deleted, so we need only * zero the inode number to mark the entry as free. If the * entry is not the first in the directory, we must reclaim * the space of the now empty record by adding the record size * to the size of the previous entry. */ int ext2_dirremove(dvp, cnp) struct vnode *dvp; struct componentname *cnp; { struct inode *dp; struct ext2_dir_entry_2 *ep; struct buf *bp; int error; dp = VTOI(dvp); if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. */ if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) return (error); ep->inode = 0; error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Collapse new free space into previous entry. */ if ((error = ext2_blkatoff(dvp, (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) return (error); ep->rec_len += dp->i_reclen; error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Rewrite an existing directory entry to point at the inode * supplied. The parameters describing the directory entry are * set up by a call to namei. */ int ext2_dirrewrite(dp, ip, cnp) struct inode *dp, *ip; struct componentname *cnp; { struct buf *bp; struct ext2_dir_entry_2 *ep; struct vnode *vdp = ITOV(dp); int error; if ((error = ext2_blkatoff(vdp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) return (error); ep->inode = ip->i_number; if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es, EXT2_FEATURE_INCOMPAT_FILETYPE)) ep->file_type = DTTOFT(IFTODT(ip->i_mode)); else ep->file_type = EXT2_FT_UNKNOWN; error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories. */ int ext2_dirempty(ip, parentino, cred) struct inode *ip; ino_t parentino; struct ucred *cred; { off_t off; struct dirtemplate dbuf; struct ext2_dir_entry_2 *dp = (struct ext2_dir_entry_2 *)&dbuf; int error, count, namlen; #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += dp->rec_len) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, &count, (struct thread *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->rec_len == 0) return (0); /* skip empty entries */ if (dp->inode == 0) continue; /* accept only "." and ".." */ namlen = dp->name_len; if (namlen > 2) return (0); if (dp->name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1) continue; if (dp->name[1] == '.' && dp->inode == parentino) continue; return (0); } return (1); } /* * Check if source directory is in the path of the target directory. * Target is supplied locked, source is unlocked. * The target is always vput before returning. */ int ext2_checkpath(source, target, cred) struct inode *source, *target; struct ucred *cred; { struct vnode *vp; int error, rootino, namlen; struct dirtemplate dirbuf; vp = ITOV(target); if (target->i_number == source->i_number) { error = EEXIST; goto out; } rootino = ROOTINO; error = 0; if (target->i_number == rootino) goto out; for (;;) { if (vp->v_type != VDIR) { error = ENOTDIR; break; } error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, (int *)0, (struct thread *)0); if (error != 0) break; namlen = dirbuf.dotdot_type; /* like ufs little-endian */ if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') { error = ENOTDIR; break; } if (dirbuf.dotdot_ino == source->i_number) { error = EINVAL; break; } if (dirbuf.dotdot_ino == rootino) break; vput(vp); if ((error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino, LK_EXCLUSIVE, &vp)) != 0) { vp = NULL; break; } } out: if (error == ENOTDIR) printf("checkpath: .. not a directory\n"); if (vp != NULL) vput(vp); return (error); } Index: head/sys/gnu/fs/ext2fs/ext2_vfsops.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 175201) +++ head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 175202) @@ -1,1157 +1,1157 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ /*- * COPYRIGHT.INFO says this has some GPL'd code from ext2_super.c in it * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *, struct thread *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); #define bsd_malloc malloc #define bsd_free free static int ext2_check_sb_compat(struct ext2_super_block *es, struct cdev *dev, int ronly); static int compute_sb_data(struct vnode * devvp, struct ext2_super_block * es, struct ext2_sb_info * fs); static const char *ext2_opts[] = { "from", "export", "acls", "exec", "noatime", "union", "suiddir", "multilabel", "nosymfollow", "noclusterr", "noclusterw", "force", NULL }; /* * VFS Operations. * * mount system call */ static int ext2_mount(mp, td) struct mount *mp; struct thread *td; { struct vfsoptlist *opts; struct vnode *devvp; struct ext2mount *ump = 0; struct ext2_sb_info *fs; char *path, *fspec; int error, flags, len; mode_t accessmode; struct nameidata nd, *ndp = &nd; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN - 1) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->s_rd_only == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT, td); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp, LK_NOWAIT, 0, td)) return (EBUSY); error = ext2_flushfiles(mp, flags, td); vfs_unbusy(mp, td); if (!error && fs->s_wasvalid) { fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } fs->s_rd_only = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->s_rd_only && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->s_es, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); DROP_GIANT(); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((fs->s_es->s_state & EXT2_VALID_FS) == 0 || (fs->s_es->s_state & EXT2_ERROR_FS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } fs->s_es->s_state &= ~EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); fs->s_rd_only = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } if (vfs_flagopt(opts, "export", NULL, 0)) { /* Process export requests in vfs_mount.c. */ return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * XXXRW: VOP_ACCESS() enough? */ accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp, td); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). */ strncpy(fs->fs_fsmnt, path, MAXMNTLEN); fs->fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } /* * checks that the data in the descriptor blocks make sense * this is taken from ext2/super.c */ static int ext2_check_descriptors (struct ext2_sb_info * sb) { int i; int desc_block = 0; unsigned long block = sb->s_es->s_first_data_block; struct ext2_group_desc * gdp = NULL; /* ext2_debug ("Checking group descriptors"); */ for (i = 0; i < sb->s_groups_count; i++) { /* examine next descriptor block */ if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext2_group_desc *) sb->s_group_desc[desc_block++]->b_data; if (gdp->bg_block_bitmap < block || gdp->bg_block_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Block bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_block_bitmap); return 0; } if (gdp->bg_inode_bitmap < block || gdp->bg_inode_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_bitmap); return 0; } if (gdp->bg_inode_table < block || gdp->bg_inode_table + sb->s_itb_per_group >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode table for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_table); return 0; } block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } return 1; } static int ext2_check_sb_compat(es, dev, ronly) struct ext2_super_block *es; struct cdev *dev; int ronly; { if (es->s_magic != EXT2_SUPER_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), es->s_magic, EXT2_SUPER_MAGIC); return (1); } if (es->s_rev_level > EXT2_GOOD_OLD_REV) { if (es->s_feature_incompat & ~EXT2_FEATURE_INCOMPAT_SUPP) { printf( "WARNING: mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } if (!ronly && (es->s_feature_ro_compat & ~EXT2_FEATURE_RO_COMPAT_SUPP)) { printf( "WARNING: R/W mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } } return (0); } /* * this computes the fields of the ext2_sb_info structure from the * data in the ext2_super_block structure read in */ static int compute_sb_data(devvp, es, fs) struct vnode * devvp; struct ext2_super_block * es; struct ext2_sb_info * fs; { int db_count, error; int i, j; int logic_sb_block = 1; /* XXX for now */ #if 1 #define V(v) #else #define V(v) printf(#v"= %d\n", fs->v); #endif fs->s_blocksize = EXT2_MIN_BLOCK_SIZE << es->s_log_block_size; V(s_blocksize) fs->s_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->s_log_block_size; V(s_bshift) fs->s_fsbtodb = es->s_log_block_size + 1; V(s_fsbtodb) fs->s_qbmask = fs->s_blocksize - 1; V(s_bmask) fs->s_blocksize_bits = EXT2_BLOCK_SIZE_BITS(es); V(s_blocksize_bits) fs->s_frag_size = EXT2_MIN_FRAG_SIZE << es->s_log_frag_size; V(s_frag_size) if (fs->s_frag_size) fs->s_frags_per_block = fs->s_blocksize / fs->s_frag_size; V(s_frags_per_block) fs->s_blocks_per_group = es->s_blocks_per_group; V(s_blocks_per_group) fs->s_frags_per_group = es->s_frags_per_group; V(s_frags_per_group) fs->s_inodes_per_group = es->s_inodes_per_group; V(s_inodes_per_group) fs->s_inodes_per_block = fs->s_blocksize / EXT2_INODE_SIZE; V(s_inodes_per_block) fs->s_itb_per_group = fs->s_inodes_per_group /fs->s_inodes_per_block; V(s_itb_per_group) fs->s_desc_per_block = fs->s_blocksize / sizeof (struct ext2_group_desc); V(s_desc_per_block) /* s_resuid / s_resgid ? */ fs->s_groups_count = (es->s_blocks_count - es->s_first_data_block + EXT2_BLOCKS_PER_GROUP(fs) - 1) / EXT2_BLOCKS_PER_GROUP(fs); V(s_groups_count) db_count = (fs->s_groups_count + EXT2_DESC_PER_BLOCK(fs) - 1) / EXT2_DESC_PER_BLOCK(fs); fs->s_db_per_group = db_count; V(s_db_per_group) fs->s_group_desc = bsd_malloc(db_count * sizeof (struct buf *), M_EXT2MNT, M_WAITOK); /* adjust logic_sb_block */ if(fs->s_blocksize > SBSIZE) /* Godmar thinks: if the blocksize is greater than 1024, then the superblock is logically part of block zero. */ logic_sb_block = 0; for (i = 0; i < db_count; i++) { error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1), fs->s_blocksize, NOCRED, &fs->s_group_desc[i]); if(error) { for (j = 0; j < i; j++) brelse(fs->s_group_desc[j]); bsd_free(fs->s_group_desc, M_EXT2MNT); printf("EXT2-fs: unable to read group descriptors (%d)\n", error); return EIO; } LCK_BUF(fs->s_group_desc[i]) } if(!ext2_check_descriptors(fs)) { for (j = 0; j < db_count; j++) ULCK_BUF(fs->s_group_desc[j]) bsd_free(fs->s_group_desc, M_EXT2MNT); printf("EXT2-fs: (ext2_check_descriptors failure) " "unable to read group descriptors\n"); return EIO; } for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) { fs->s_inode_bitmap_number[i] = 0; fs->s_inode_bitmap[i] = NULL; fs->s_block_bitmap_number[i] = 0; fs->s_block_bitmap[i] = NULL; } fs->s_loaded_inode_bitmaps = 0; fs->s_loaded_block_bitmaps = 0; if (es->s_rev_level == EXT2_GOOD_OLD_REV || (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_LARGE_FILE) == 0) fs->fs_maxfilesize = 0x7fffffff; else fs->fs_maxfilesize = 0x7fffffffffffffff; return 0; } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ext2_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2_super_block * es; struct ext2_sb_info *fs; int error; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2_super_block *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->s_es, sizeof(struct ext2_super_block)); if((error = compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return error; } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); loop: MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Common code for mount and mountroot */ static int ext2_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ext2mount *ump; struct buf *bp; struct ext2_sb_info *fs; struct ext2_super_block * es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; int error; int ronly; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2_super_block *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((es->s_state & EXT2_VALID_FS) == 0 || (es->s_state & EXT2_ERROR_FS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = bsd_malloc(sizeof *ump, M_EXT2MNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); /* I don't know whether this is the right strategy. Note that we dynamically allocate both an ext2_sb_info and an ext2_super_block while Linux keeps the super block in a locked buffer */ ump->um_e2fs = bsd_malloc(sizeof(struct ext2_sb_info), M_EXT2MNT, M_WAITOK); ump->um_e2fs->s_es = bsd_malloc(sizeof(struct ext2_super_block), M_EXT2MNT, M_WAITOK); bcopy(es, ump->um_e2fs->s_es, (u_int)sizeof(struct ext2_super_block)); if ((error = compute_sb_data(devvp, ump->um_e2fs->s_es, ump->um_e2fs))) goto out; /* * We don't free the group descriptors allocated by compute_sb_data() * until ext2_unmount(). This is OK since the mount will succeed. */ brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->s_rd_only = ronly; /* ronly is set according to mnt_flags */ /* if the fs is not mounted read-only, make sure the super block is always written back on a sync() */ fs->s_wasvalid = fs->s_es->s_state & EXT2_VALID_FS ? 1 : 0; if (ronly == 0) { fs->s_dirt = 1; /* mark it modified */ fs->s_es->s_state &= ~EXT2_VALID_FS; /* set fs invalid */ } mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* setting those two parameters allowed us to use ufs_bmap w/o changse ! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = fs->s_es->s_log_block_size + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { bsd_free(ump->um_e2fs->s_es, M_EXT2MNT); bsd_free(ump->um_e2fs, M_EXT2MNT); bsd_free(ump, M_EXT2MNT); mp->mnt_data = NULL; } return (error); } /* * unmount system call */ static int ext2_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ext2mount *ump; struct ext2_sb_info *fs; int error, flags, ronly, i; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, td)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->s_rd_only; if (ronly == 0) { if (fs->s_wasvalid) fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } /* release buffers containing group descriptors */ for(i = 0; i < fs->s_db_per_group; i++) ULCK_BUF(fs->s_group_desc[i]) bsd_free(fs->s_group_desc, M_EXT2MNT); /* release cached inode/block bitmaps */ for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_inode_bitmap[i]) ULCK_BUF(fs->s_inode_bitmap[i]) for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_block_bitmap[i]) ULCK_BUF(fs->s_block_bitmap[i]) DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); bsd_free(fs->s_es, M_EXT2MNT); bsd_free(fs, M_EXT2MNT); bsd_free(ump, M_EXT2MNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get file system statistics. * taken from ext2/super.c ext2_statfs */ static int ext2_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { unsigned long overhead; struct ext2mount *ump; struct ext2_sb_info *fs; struct ext2_super_block *es; int i, nsb; ump = VFSTOEXT2(mp); fs = ump->um_e2fs; es = fs->s_es; if (es->s_magic != EXT2_SUPER_MAGIC) panic("ext2_statfs - magic number spoiled"); /* * Compute the overhead (FS structures) */ if (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER) { nsb = 0; for (i = 0 ; i < fs->s_groups_count; i++) if (ext2_group_sparse(i)) nsb++; } else nsb = fs->s_groups_count; overhead = es->s_first_data_block + /* Superblocks and block group descriptors: */ nsb * (1 + fs->s_db_per_group) + /* Inode bitmap, block bitmap, and inode table: */ fs->s_groups_count * (1 + 1 + fs->s_itb_per_group); sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = es->s_blocks_count - overhead; sbp->f_bfree = es->s_free_blocks_count; sbp->f_bavail = sbp->f_bfree - es->s_r_blocks_count; sbp->f_files = es->s_inodes_count; sbp->f_ffree = es->s_free_inodes_count; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { struct vnode *mvp, *vp; struct inode *ip; struct ext2mount *ump = VFSTOEXT2(mp); struct ext2_sb_info *fs; int error, allerror = 0; fs = ump->um_e2fs; if (fs->s_dirt != 0 && fs->s_rd_only != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ext2_sync: rofs mod"); } /* * Write back each (modified) inode. */ MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { MNT_ILOCK(mp); if (error == ENOENT) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } continue; } if ((error = VOP_FSYNC(vp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Force stale file system control information to be flushed. */ if (waitfor != MNT_LAZY) { - vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp, 0, td); } /* * Write back modified superblock. */ if (fs->s_dirt != 0) { fs->s_dirt = 0; fs->s_es->s_wtime = time_second; if ((error = ext2_sbupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct ext2_sb_info *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int i, error; int used_blocks; struct thread *td; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); dev = ump->um_dev; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ext2_sync() if a sync happens to fire right then, * which will cause a panic because ext2_sync() blindly * dereferences vp->v_data (as well it should). */ ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td); error = insmntque(vp, mp); if (error != 0) { free(ip, M_EXT2NODE); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ #if 0 printf("ext2_vget(%d) dbn= %d ", ino, fsbtodb(fs, ino_to_fsba(fs, ino))); #endif if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->s_blocksize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vput(vp); brelse(bp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ino)), ip); ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; ip->i_prealloc_count = 0; ip->i_prealloc_block = 0; /* now we want to make sure that block pointers for unused blocks are zeroed out - ext2_balloc depends on this although for regular files and directories only */ if(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode)) { used_blocks = (ip->i_size+fs->s_blocksize-1) / fs->s_blocksize; for(i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } /* ext2_print_inode(ip); */ brelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ext2_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct ext2_sb_info *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino > fs->s_groups_count * fs->s_es->s_inodes_per_group) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(mp, waitfor) struct ext2mount *mp; int waitfor; { struct ext2_sb_info *fs = mp->um_e2fs; struct ext2_super_block *es = fs->s_es; struct buf *bp; int error = 0; /* printf("\nupdating superblock, waitfor=%s\n", waitfor == MNT_WAIT ? "yes":"no"); */ bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2_super_block)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here */ return (error); } /* * Return the root of a filesystem. */ static int ext2_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct vnode *nvp; int error; error = VFS_VGET(mp, (ino_t)ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } Index: head/sys/gnu/fs/ext2fs/ext2_vnops.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_vnops.c (revision 175201) +++ head/sys/gnu/fs/ext2fs/ext2_vnops.c (revision 175202) @@ -1,1775 +1,1775 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.7 (Berkeley) 2/3/94 * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_suiddir.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); static vop_access_t ext2_access; static vop_advlock_t ext2_advlock; static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ext2_close; static vop_create_t ext2_create; static vop_fsync_t ext2_fsync; static vop_getattr_t ext2_getattr; static vop_kqfilter_t ext2_kqfilter; static vop_link_t ext2_link; static vop_mkdir_t ext2_mkdir; static vop_mknod_t ext2_mknod; static vop_open_t ext2_open; static vop_pathconf_t ext2_pathconf; static vop_print_t ext2_print; static vop_read_t ext2_read; static vop_readlink_t ext2_readlink; static vop_remove_t ext2_remove; static vop_rename_t ext2_rename; static vop_rmdir_t ext2_rmdir; static vop_setattr_t ext2_setattr; static vop_strategy_t ext2_strategy; static vop_symlink_t ext2_symlink; static vop_write_t ext2_write; static vop_vptofh_t ext2_vptofh; static vop_close_t ext2fifo_close; static vop_kqfilter_t ext2fifo_kqfilter; static int filt_ext2read(struct knote *kn, long hint); static int filt_ext2write(struct knote *kn, long hint); static int filt_ext2vnode(struct knote *kn, long hint); static void filt_ext2detach(struct knote *kn); /* Global vfs data structures for ext2. */ struct vop_vector ext2_vnodeops = { .vop_default = &default_vnodeops, .vop_access = ext2_access, .vop_advlock = ext2_advlock, .vop_bmap = ext2_bmap, .vop_cachedlookup = ext2_lookup, .vop_close = ext2_close, .vop_create = ext2_create, .vop_fsync = ext2_fsync, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_link = ext2_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = ext2_mkdir, .vop_mknod = ext2_mknod, .vop_open = ext2_open, .vop_pathconf = ext2_pathconf, .vop_poll = vop_stdpoll, .vop_kqfilter = ext2_kqfilter, .vop_print = ext2_print, .vop_read = ext2_read, .vop_readdir = ext2_readdir, .vop_readlink = ext2_readlink, .vop_reallocblks = ext2_reallocblks, .vop_reclaim = ext2_reclaim, .vop_remove = ext2_remove, .vop_rename = ext2_rename, .vop_rmdir = ext2_rmdir, .vop_setattr = ext2_setattr, .vop_strategy = ext2_strategy, .vop_symlink = ext2_symlink, .vop_write = ext2_write, .vop_vptofh = ext2_vptofh, }; struct vop_vector ext2_fifoops = { .vop_default = &fifo_specops, .vop_access = ext2_access, .vop_close = ext2fifo_close, .vop_fsync = ext2_fsync, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_kqfilter = ext2fifo_kqfilter, .vop_print = ext2_print, .vop_read = VOP_PANIC, .vop_reclaim = ext2_reclaim, .vop_setattr = ext2_setattr, .vop_write = VOP_PANIC, .vop_vptofh = ext2_vptofh, }; #include /* * A virgin directory (no blushing please). * Note that the type and namlen fields are reversed relative to ext2. * Also, we don't use `struct odirtemplate', since it would just cause * endianness problems. */ static struct dirtemplate mastertemplate = { 0, 12, 1, EXT2_FT_DIR, ".", 0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".." }; static struct dirtemplate omastertemplate = { 0, 12, 1, EXT2_FT_UNKNOWN, ".", 0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".." }; void ext2_itimes(vp) struct vnode *vp; { struct inode *ip; struct timespec ts; ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } /* * Create a regular file */ static int ext2_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); return (0); } static int ext2_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ static int ext2_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes(vp); VI_UNLOCK(vp); return (0); } static int ext2_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); mode_t mode = ap->a_mode; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) return (EOPNOTSUPP); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); return (error); } static int ext2_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; ext2_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_devvp->v_rdev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = ip->i_atimensec; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = ip->i_mtimensec; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = ip->i_ctimensec; vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ext2_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to unset system flags, or * modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } ip->i_flags = vap->va_flags; } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || (vap->va_flags & UF_SETTABLE) != vap->va_flags) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; ext2_itimes(vp); if (vap->va_atime.tv_sec != VNOVAL) { ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } error = ext2_update(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ext2_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ext2_chmod(vp, mode, cred, td) struct vnode *vp; int mode; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { error = priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0); if (error) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID, 0); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ext2_chown(vp, uid, gid, cred, td) struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. */ if (uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0); if (error) return (error); } ogid = ip->i_gid; ouid = ip->i_uid; ip->i_gid = gid; ip->i_uid = uid; ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0) != 0) ip->i_mode &= ~(ISUID | ISGID); } return (0); } /* * Synch an open file. */ /* ARGSUSED */ static int ext2_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { /* * Flush all dirty buffers associated with a vnode. */ ext2_discard_prealloc(VTOI(ap->a_vp)); vop_stdfsync(ap); return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT)); } /* * Mknod vnode call */ /* ARGSUSED */ static int ext2_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ ip->i_rdev = vap->va_rdev; } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } static int ext2_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ext2_dirremove(dvp, ap->a_cnp); if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } /* * link vnode call */ static int ext2_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; int error; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_link: no name"); #endif if (tdvp->v_mount != vp->v_mount) { error = EXDEV; goto out; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; error = ext2_update(vp, 1); if (!error) error = ext2_direnter(ip, tdvp, cnp); if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } /* * Rename system call. * See comments in sys/ufs/ufs/ufs_vnops.c */ static int ext2_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; struct inode *ip, *xp, *dp; struct dirtemplate dirbuf; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ext2_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. Temporarily just warn if they do. */ if (fvp == tvp) { printf("ext2_rename: fvp == tvp (can't happen)\n"); error = 0; goto abortit; } - if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= LINK_MAX) { VOP_UNLOCK(fvp, 0, td); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0, td); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0, td); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_nlink++; ip->i_flag |= IN_CHANGE; if ((error = ext2_update(fvp, 1)) != 0) { VOP_UNLOCK(fvp, 0, td); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0, td); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ext2_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_nlink++; dp->i_flag |= IN_CHANGE; error = ext2_update(tdvp, 1); if (error) goto bad; } error = ext2_direnter(ip, tdvp, tcnp); if (error) { if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; (void)ext2_update(tdvp, 1); } goto bad; } vput(tdvp); } else { if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ext2_rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { if (! ext2_dirempty(xp, dp->i_number, tcnp->cn_cred) || xp->i_nlink > 2) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ext2_dirrewrite(dp, ip, tcnp); if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ xp->i_nlink--; if (doingdirectory) { if (--xp->i_nlink != 0) panic("ext2_rename: linked directory"); error = ext2_truncate(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, tcnp->cn_thread); } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. */ if (doingdirectory) panic("ext2_rename: lost dir entry"); vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { if (doingdirectory) panic("ext2_rename: lost dir entry"); } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, (int *)0, (struct thread *)0); if (error == 0) { /* Like ufs little-endian: */ namlen = dirbuf.dotdot_type; if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') { ext2_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf.dotdot_ino = newparent; (void) vn_rdwr(UIO_WRITE, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, (int *)0, (struct thread *)0); cache_purge(fdvp); } } } error = ext2_dirremove(fdvp, fcnp); if (!error) { xp->i_nlink--; xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE, td) == 0) { + if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } /* * Mkdir system call */ static int ext2_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; int error, dmode; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ext2_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_gid = dp->i_gid; #ifdef SUIDDIR { /* * if we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ( (dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; error = ext2_update(tvp, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ dp->i_nlink++; dp->i_flag |= IN_CHANGE; error = ext2_update(dvp, 1); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es, EXT2_FEATURE_INCOMPAT_FILETYPE)) dtp = &mastertemplate; else dtp = &omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; /* note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE * so let's just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ VTOI(dvp)->i_e2fs->s_blocksize dirtemplate.dotdot_reclen = DIRBLKSIZ - 12; error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate, sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED, (int *)0, (struct thread *)0); if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) /* XXX should grow with balloc() */ panic("ext2_mkdir: blksize"); else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } /* Directory set up, now install its entry in the parent directory. */ error = ext2_direnter(ip, dvp, cnp); if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0. */ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: return (error); #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE } /* * Rmdir system call. */ static int ext2_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (ip->i_nlink != 2 || !ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ error = ext2_dirremove(dvp, cnp); if (error) goto out; dp->i_nlink--; dp->i_flag |= IN_CHANGE; cache_purge(dvp); VOP_UNLOCK(dvp, 0, td); /* * Truncate inode. The only stuff left * in the directory is "." and "..". The * "." reference is inconsequential since * we're quashing it. The ".." reference * has already been adjusted above. We've * removed the "." reference and the reference * in the parent directory, but there may be * other hard links so decrement by 2 and * worry about them later. */ ip->i_nlink -= 2; error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred, td); cache_purge(ITOV(ip)); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); out: return (error); } /* * symlink -- make a symbolic link */ static int ext2_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, (int *)0, (struct thread *)0); if (error) vput(vp); return (error); } /* * Return target name of a symbolic link */ static int ext2_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if (isize < vp->v_mount->mnt_maxsymlinklen) { uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ext2_bmaparray() operation may not * deadlock on memory. See ext2_bmap() for details. */ static int ext2_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct inode *ip; struct bufobj *bo; int32_t blkno; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ext2_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = VFSTOEXT2(vp->v_mount)->um_bo; BO_STRATEGY(bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ext2_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); vn_printf(ip->i_devvp, "\tino %lu", (u_long)ip->i_number); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ext2fifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ext2 kqfilter routines if needed */ static int ext2fifo_kqfilter(ap) struct vop_kqfilter_args *ap; { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = ext2_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ext2 filesystems. */ static int ext2_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ static int ext2_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { struct inode *ip = VTOI(ap->a_vp); return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; struct fid *a_fhp; } */ *ap; { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ext2_vinit(mntp, fifoops, vpp) struct mount *mntp; struct vop_vector *fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == ROOTINO) vp->v_vflag |= VV_ROOT; ip->i_modrev = init_va_filerev(); *vpp = vp; return (0); } /* * Allocate a new inode. */ static int ext2_makeinode(mode, dvp, vpp, cnp) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { struct inode *ip, *pdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp); if (error) { return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { /* * if we are * not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ( (dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) { if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID, 0)) ip->i_mode &= ~ISGID; } if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = ext2_update(tvp, 1); if (error) goto bad; error = ext2_direnter(ip, dvp, cnp); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } static struct filterops ext2read_filtops = { 1, NULL, filt_ext2detach, filt_ext2read }; static struct filterops ext2write_filtops = { 1, NULL, filt_ext2detach, filt_ext2write }; static struct filterops ext2vnode_filtops = { 1, NULL, filt_ext2detach, filt_ext2vnode }; static int ext2_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &ext2read_filtops; break; case EVFILT_WRITE: kn->kn_fop = &ext2write_filtops; break; case EVFILT_VNODE: kn->kn_fop = &ext2vnode_filtops; break; default: return (1); } kn->kn_hook = (caddr_t)vp; if (vp->v_pollinfo == NULL) v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return ENOMEM; knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); return (0); } static void filt_ext2detach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_ext2read(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct inode *ip = VTOI(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } kn->kn_data = ip->i_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_ext2write(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_ext2vnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } Index: head/sys/gnu/fs/ext2fs/fs.h =================================================================== --- head/sys/gnu/fs/ext2fs/fs.h (revision 175201) +++ head/sys/gnu/fs/ext2fs/fs.h (revision 175202) @@ -1,170 +1,170 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fs.h 8.7 (Berkeley) 4/19/94 * $FreeBSD$ */ /* * Each disk drive contains some number of file systems. * A file system consists of a number of cylinder groups. * Each cylinder group has inodes and data. * * A file system is described by its super-block, which in turn * describes the cylinder groups. The super-block is critical * data and is replicated in each cylinder group to protect against * catastrophic loss. This is done at `newfs' time and the critical * super-block data does not change, so the copies need not be * referenced further unless disaster strikes. * * The first boot and super blocks are given in absolute disk addresses. * The byte-offset forms are preferred, as they don't imply a sector size. */ #define SBSIZE 1024 #define SBLOCK 2 /* * The path name on which the file system is mounted is maintained * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in * the super block for this name. */ #define MAXMNTLEN 512 /* * Macros for access to superblock array structures */ /* * Convert cylinder group to base address of its global summary info. */ #define fs_cs(fs, cgindx) (((struct ext2_group_desc *) \ (fs->s_group_desc[cgindx / EXT2_DESC_PER_BLOCK(fs)]->b_data)) \ [cgindx % EXT2_DESC_PER_BLOCK(fs)]) /* * Turn file system block numbers into disk block addresses. * This maps file system blocks to device size blocks. */ #define fsbtodb(fs, b) ((b) << ((fs)->s_fsbtodb)) #define dbtofsb(fs, b) ((b) >> ((fs)->s_fsbtodb)) /* get group containing inode */ #define ino_to_cg(fs, x) (((x) - 1) / EXT2_INODES_PER_GROUP(fs)) /* get block containing inode from its number x */ #define ino_to_fsba(fs, x) fs_cs(fs, ino_to_cg(fs, x)).bg_inode_table + \ (((x)-1) % EXT2_INODES_PER_GROUP(fs))/EXT2_INODES_PER_BLOCK(fs) /* get offset for inode in block */ #define ino_to_fsbo(fs, x) ((x-1) % EXT2_INODES_PER_BLOCK(fs)) /* * Give cylinder group number for a file system block. * Give cylinder group block number for a file system block. */ #define dtog(fs, d) (((d) - fs->s_es->s_first_data_block) / \ EXT2_BLOCKS_PER_GROUP(fs)) #define dtogd(fs, d) (((d) - fs->s_es->s_first_data_block) % \ EXT2_BLOCKS_PER_GROUP(fs)) /* * The following macros optimize certain frequently calculated * quantities by using shifts and masks in place of divisions * modulos and multiplications. */ #define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ ((loc) & (fs)->s_qbmask) #define lblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ ((blk) << (fs->s_bshift)) #define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ ((loc) >> (fs->s_bshift)) /* no fragments -> logical block number equal # of frags */ #define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ ((loc) >> (fs->s_bshift)) #define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ roundup(size, fs->s_frag_size) /* was (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) */ /* * Determining the size of a file block in the file system. * easy w/o fragments */ #define blksize(fs, ip, lbn) ((fs)->s_frag_size) /* * INOPB is the number of inodes in a secondary storage block. */ #define INOPB(fs) EXT2_INODES_PER_BLOCK(fs) /* * NINDIR is the number of indirects in a file system block. */ #define NINDIR(fs) (EXT2_ADDR_PER_BLOCK(fs)) extern int inside[], around[]; extern u_char *fragtbl[]; /* a few remarks about superblock locking/unlocking * Linux provides special routines for doing so * I haven't figured out yet what BSD does * I think I'll try a VOP_LOCK/VOP_UNLOCK on the device vnode */ #define DEVVP(inode) (VFSTOEXT2(ITOV(inode)->v_mount)->um_devvp) -#define lock_super(devvp) vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, curthread) +#define lock_super(devvp) vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY) #define unlock_super(devvp) VOP_UNLOCK(devvp, 0, curthread) /* * Historically, ext2fs kept it's metadata buffers on the LOCKED queue. Now, * we change the lock owner to kern so that we may use it from contexts other * than the one that originally locked it. When we are finished with the * buffer, we release it, writing it first if it was dirty. */ #define LCK_BUF(bp) { \ (bp)->b_flags |= B_PERSISTENT; \ BUF_KERNPROC(bp); \ } #define ULCK_BUF(bp) { \ long flags; \ flags = (bp)->b_flags; \ (bp)->b_flags &= ~(B_DIRTY | B_PERSISTENT); \ if (flags & B_DIRTY) \ bwrite(bp); \ else \ brelse(bp); \ } Index: head/sys/gnu/fs/reiserfs/reiserfs_namei.c =================================================================== --- head/sys/gnu/fs/reiserfs/reiserfs_namei.c (revision 175201) +++ head/sys/gnu/fs/reiserfs/reiserfs_namei.c (revision 175202) @@ -1,701 +1,701 @@ /*- * Copyright 2000 Hans Reiser * See README for licensing and copyright details * * Ported to FreeBSD by Jean-Sébastien Pédron * * $FreeBSD$ */ #include static int reiserfs_find_entry(struct reiserfs_node *dp, const char *name, int namelen, struct path * path_to_entry, struct reiserfs_dir_entry *de); MALLOC_DEFINE(M_REISERFSCOOKIES, "reiserfs_cookies", "ReiserFS VOP_READDIR cookies"); /* ------------------------------------------------------------------- * Lookup functions * -------------------------------------------------------------------*/ int reiserfs_lookup(struct vop_cachedlookup_args *ap) { int error, retval; struct vnode *vdp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; struct cpu_key *saved_ino; struct vnode *vp; struct vnode *pdp; /* Saved dp during symlink work */ struct reiserfs_node *dp; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); char c = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; reiserfs_log(LOG_DEBUG, "looking for `%s', %ld (%s)\n", cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_pnbuf); cnp->cn_nameptr[cnp->cn_namelen] = c; vp = NULL; dp = VTOI(vdp); if (REISERFS_MAX_NAME(dp->i_reiserfs->s_blocksize) < cnp->cn_namelen) return (ENAMETOOLONG); reiserfs_log(LOG_DEBUG, "searching entry\n"); de.de_gen_number_bit_string = 0; retval = reiserfs_find_entry(dp, cnp->cn_nameptr, cnp->cn_namelen, &path_to_entry, &de); pathrelse(&path_to_entry); if (retval == NAME_FOUND) { reiserfs_log(LOG_DEBUG, "found\n"); } else { reiserfs_log(LOG_DEBUG, "not found\n"); } if (retval == NAME_FOUND) { #if 0 /* Hide the .reiserfs_priv directory */ if (reiserfs_xattrs(dp->i_reiserfs) && !old_format_only(dp->i_reiserfs) && REISERFS_SB(dp->i_reiserfs)->priv_root && REISERFS_SB(dp->i_reiserfs)->priv_root->d_inode && de.de_objectid == le32toh(INODE_PKEY(REISERFS_SB( dp->i_reiserfs)->priv_root->d_inode)->k_objectid)) { return (EACCES); } #endif reiserfs_log(LOG_DEBUG, "reading vnode\n"); pdp = vdp; if (flags & ISDOTDOT) { saved_ino = (struct cpu_key *)&(de.de_dir_id); VOP_UNLOCK(pdp, 0, td); error = reiserfs_iget(vdp->v_mount, saved_ino, &vp, td); - vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) return (error); *vpp = vp; } else if (de.de_objectid == dp->i_number && de.de_dir_id == dp->i_ino) { VREF(vdp); /* We want ourself, ie "." */ *vpp = vdp; } else { if ((error = reiserfs_iget(vdp->v_mount, (struct cpu_key *)&(de.de_dir_id), &vp, td)) != 0) return (error); *vpp = vp; } /* * Propogate the priv_object flag so we know we're in the * priv tree */ /*if (is_reiserfs_priv_object(dir)) REISERFS_I(inode)->i_flags |= i_priv_object;*/ } else { if (retval == IO_ERROR) { reiserfs_log(LOG_DEBUG, "IO error\n"); return (EIO); } return (ENOENT); } /* Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); reiserfs_log(LOG_DEBUG, "done\n"); return (0); } extern struct key MIN_KEY; int reiserfs_readdir(struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */*ap) { int error = 0; struct dirent dstdp; struct uio *uio = ap->a_uio; off_t next_pos; struct buf *bp; struct item_head *ih; struct cpu_key pos_key; const struct key *rkey; struct reiserfs_node *ip; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); int entry_num, item_num, search_res; /* The NFS part */ int ncookies = 0; u_long *cookies = NULL; /* * Form key for search the next directory entry using f_pos field of * file structure */ ip = VTOI(ap->a_vp); make_cpu_key(&pos_key, ip, uio->uio_offset ? uio->uio_offset : DOT_OFFSET, TYPE_DIRENTRY, 3); next_pos = cpu_key_k_offset(&pos_key); reiserfs_log(LOG_DEBUG, "listing entries for " "(objectid=%d, dirid=%d)\n", pos_key.on_disk_key.k_objectid, pos_key.on_disk_key.k_dir_id); reiserfs_log(LOG_DEBUG, "uio_offset = %jd, uio_resid = %d\n", (intmax_t)uio->uio_offset, uio->uio_resid); if (ap->a_ncookies && ap->a_cookies) { cookies = (u_long *)malloc( uio->uio_resid / 16 * sizeof(u_long), M_REISERFSCOOKIES, M_WAITOK); } while (1) { //research: /* * Search the directory item, containing entry with * specified key */ reiserfs_log(LOG_DEBUG, "search directory to read\n"); search_res = search_by_entry_key(ip->i_reiserfs, &pos_key, &path_to_entry, &de); if (search_res == IO_ERROR) { error = EIO; goto out; } entry_num = de.de_entry_num; item_num = de.de_item_num; bp = de.de_bp; ih = de.de_ih; if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT(ih)) { /* * Go through all entries in the directory item * beginning from the entry, that has been found. */ struct reiserfs_de_head *deh = B_I_DEH(bp, ih) + entry_num; if (ap->a_ncookies == NULL) { cookies = NULL; } else { //ncookies = } reiserfs_log(LOG_DEBUG, "walking through directory entries\n"); for (; entry_num < I_ENTRY_COUNT(ih); entry_num++, deh++) { int d_namlen; char *d_name; off_t d_off; ino_t d_ino; if (!de_visible(deh)) { /* It is hidden entry */ continue; } d_namlen = entry_length(bp, ih, entry_num); d_name = B_I_DEH_ENTRY_FILE_NAME(bp, ih, deh); if (!d_name[d_namlen - 1]) d_namlen = strlen(d_name); reiserfs_log(LOG_DEBUG, " - `%s' (len=%d)\n", d_name, d_namlen); if (d_namlen > REISERFS_MAX_NAME( ip->i_reiserfs->s_blocksize)) { /* Too big to send back to VFS */ continue; } #if 0 /* Ignore the .reiserfs_priv entry */ if (reiserfs_xattrs(ip->i_reiserfs) && !old_format_only(ip->i_reiserfs) && filp->f_dentry == ip->i_reiserfs->s_root && REISERFS_SB(ip->i_reiserfs)->priv_root && REISERFS_SB(ip->i_reiserfs)->priv_root->d_inode && deh_objectid(deh) == le32toh(INODE_PKEY(REISERFS_SB( ip->i_reiserfs)->priv_root->d_inode)->k_objectid)) { continue; } #endif d_off = deh_offset(deh); d_ino = deh_objectid(deh); uio->uio_offset = d_off; /* Copy to user land */ dstdp.d_fileno = d_ino; dstdp.d_type = DT_UNKNOWN; dstdp.d_namlen = d_namlen; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct dirent, d_name) - dstdp.d_namlen); if (d_namlen > 0) { if (dstdp.d_reclen <= uio->uio_resid) { reiserfs_log(LOG_DEBUG, " copying to user land\n"); error = uiomove(&dstdp, dstdp.d_reclen, uio); if (error) goto end; if (cookies != NULL) { cookies[ncookies] = d_off; ncookies++; } } else break; } else { error = EIO; break; } next_pos = deh_offset(deh) + 1; } reiserfs_log(LOG_DEBUG, "...done\n"); } reiserfs_log(LOG_DEBUG, "checking item num (%d == %d ?)\n", item_num, B_NR_ITEMS(bp) - 1); if (item_num != B_NR_ITEMS(bp) - 1) { /* End of directory has been reached */ reiserfs_log(LOG_DEBUG, "end reached\n"); if (ap->a_eofflag) *ap->a_eofflag = 1; goto end; } /* * Item we went through is last item of node. Using right * delimiting key check is it directory end */ reiserfs_log(LOG_DEBUG, "get right key\n"); rkey = get_rkey(&path_to_entry, ip->i_reiserfs); reiserfs_log(LOG_DEBUG, "right key = (objectid=%d, dirid=%d)\n", rkey->k_objectid, rkey->k_dir_id); reiserfs_log(LOG_DEBUG, "compare it to MIN_KEY\n"); reiserfs_log(LOG_DEBUG, "MIN KEY = (objectid=%d, dirid=%d)\n", MIN_KEY.k_objectid, MIN_KEY.k_dir_id); if (comp_le_keys(rkey, &MIN_KEY) == 0) { /* Set pos_key to key, that is the smallest and greater * that key of the last entry in the item */ reiserfs_log(LOG_DEBUG, "continuing on the right\n"); set_cpu_key_k_offset(&pos_key, next_pos); continue; } reiserfs_log(LOG_DEBUG, "compare it to pos_key\n"); reiserfs_log(LOG_DEBUG, "pos key = (objectid=%d, dirid=%d)\n", pos_key.on_disk_key.k_objectid, pos_key.on_disk_key.k_dir_id); if (COMP_SHORT_KEYS(rkey, &pos_key)) { /* End of directory has been reached */ reiserfs_log(LOG_DEBUG, "end reached (right)\n"); if (ap->a_eofflag) *ap->a_eofflag = 1; goto end; } /* Directory continues in the right neighboring block */ reiserfs_log(LOG_DEBUG, "continuing with a new offset\n"); set_cpu_key_k_offset(&pos_key, le_key_k_offset(KEY_FORMAT_3_5, rkey)); reiserfs_log(LOG_DEBUG, "new pos key = (objectid=%d, dirid=%d)\n", pos_key.on_disk_key.k_objectid, pos_key.on_disk_key.k_dir_id); } end: uio->uio_offset = next_pos; pathrelse(&path_to_entry); reiserfs_check_path(&path_to_entry); out: if (error && cookies != NULL) { free(cookies, M_REISERFSCOOKIES); } else if (ap->a_ncookies != NULL && ap->a_cookies != NULL) { *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } return (error); } /* ------------------------------------------------------------------- * Functions from linux/fs/reiserfs/namei.c * -------------------------------------------------------------------*/ /* * Directory item contains array of entry headers. This performs binary * search through that array. */ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, off_t off) { struct item_head *ih = de->de_ih; struct reiserfs_de_head *deh = de->de_deh; int rbound, lbound, j; lbound = 0; rbound = I_ENTRY_COUNT(ih) - 1; for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { if (off < deh_offset(deh + j)) { rbound = j - 1; continue; } if (off > deh_offset(deh + j)) { lbound = j + 1; continue; } /* This is not name found, but matched third key component */ de->de_entry_num = j; return (NAME_FOUND); } de->de_entry_num = lbound; return (NAME_NOT_FOUND); } /* * Comment? Maybe something like set de to point to what the path * points to? */ static inline void set_de_item_location(struct reiserfs_dir_entry *de, struct path *path) { de->de_bp = get_last_bp(path); de->de_ih = get_ih(path); de->de_deh = B_I_DEH(de->de_bp, de->de_ih); de->de_item_num = PATH_LAST_POSITION(path); } /* * de_bh, de_ih, de_deh (points to first element of array), de_item_num * is set */ void set_de_name_and_namelen(struct reiserfs_dir_entry *de) { struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; if (de->de_entry_num >= ih_entry_count(de->de_ih)) { reiserfs_log(LOG_DEBUG, "BUG\n"); return; } de->de_entrylen = entry_length(de->de_bp, de->de_ih, de->de_entry_num); de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); de->de_name = B_I_PITEM(de->de_bp, de->de_ih) + deh_location(deh); if (de->de_name[de->de_namelen - 1] == 0) de->de_namelen = strlen(de->de_name); } /* What entry points to */ static inline void set_de_object_key(struct reiserfs_dir_entry *de) { if (de->de_entry_num >= ih_entry_count(de->de_ih)) { reiserfs_log(LOG_DEBUG, "BUG\n"); return; } de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); } static inline void store_de_entry_key(struct reiserfs_dir_entry *de) { struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; if (de->de_entry_num >= ih_entry_count(de->de_ih)) { reiserfs_log(LOG_DEBUG, "BUG\n"); return; } /* Store key of the found entry */ de->de_entry_key.version = KEY_FORMAT_3_5; de->de_entry_key.on_disk_key.k_dir_id = le32toh(de->de_ih->ih_key.k_dir_id); de->de_entry_key.on_disk_key.k_objectid = le32toh(de->de_ih->ih_key.k_objectid); set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh)); set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY); } /* * We assign a key to each directory item, and place multiple entries in * a single directory item. A directory item has a key equal to the key * of the first directory entry in it. * * This function first calls search_by_key, then, if item whose first * entry matches is not found it looks for the entry inside directory * item found by search_by_key. Fills the path to the entry, and to the * entry position in the item */ int search_by_entry_key(struct reiserfs_sb_info *sbi, const struct cpu_key *key, struct path *path, struct reiserfs_dir_entry *de) { int retval; reiserfs_log(LOG_DEBUG, "searching in (objectid=%d,dirid=%d)\n", key->on_disk_key.k_objectid, key->on_disk_key.k_dir_id); retval = search_item(sbi, key, path); switch (retval) { case ITEM_NOT_FOUND: if (!PATH_LAST_POSITION(path)) { reiserfs_log(LOG_DEBUG, "search_by_key returned item position == 0"); pathrelse(path); return (IO_ERROR); } PATH_LAST_POSITION(path)--; reiserfs_log(LOG_DEBUG, "search_by_key did not found it\n"); break; case ITEM_FOUND: reiserfs_log(LOG_DEBUG, "search_by_key found it\n"); break; case IO_ERROR: return (retval); default: pathrelse(path); reiserfs_log(LOG_DEBUG, "no path to here"); return (IO_ERROR); } reiserfs_log(LOG_DEBUG, "set item location\n"); set_de_item_location(de, path); /* * Binary search in directory item by third component of the * key. Sets de->de_entry_num of de */ reiserfs_log(LOG_DEBUG, "bin_search_in_dir_item\n"); retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); path->pos_in_item = de->de_entry_num; if (retval != NAME_NOT_FOUND) { /* * Ugly, but rename needs de_bp, de_deh, de_name, de_namelen, * de_objectid set */ set_de_name_and_namelen(de); set_de_object_key(de); reiserfs_log(LOG_DEBUG, "set (objectid=%d,dirid=%d)\n", de->de_objectid, de->de_dir_id); } return (retval); } static uint32_t get_third_component(struct reiserfs_sb_info *sbi, const char *name, int len) { uint32_t res; if (!len || (len == 1 && name[0] == '.')) return (DOT_OFFSET); if (len == 2 && name[0] == '.' && name[1] == '.') return (DOT_DOT_OFFSET); res = REISERFS_SB(sbi)->s_hash_function(name, len); /* Take bits from 7-th to 30-th including both bounds */ res = GET_HASH_VALUE(res); if (res == 0) /* * Needed to have no names before "." and ".." those have hash * value == 0 and generation counters 1 and 2 accordingly */ res = 128; return (res + MAX_GENERATION_NUMBER); } static int reiserfs_match(struct reiserfs_dir_entry *de, const char *name, int namelen) { int retval = NAME_NOT_FOUND; if ((namelen == de->de_namelen) && !memcmp(de->de_name, name, de->de_namelen)) retval = (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND : NAME_FOUND_INVISIBLE); return (retval); } /* * de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already * Used when hash collisions exist */ static int linear_search_in_dir_item(struct cpu_key *key, struct reiserfs_dir_entry *de, const char *name, int namelen) { int i; int retval; struct reiserfs_de_head * deh = de->de_deh; i = de->de_entry_num; if (i == I_ENTRY_COUNT(de->de_ih) || GET_HASH_VALUE(deh_offset(deh + i)) != GET_HASH_VALUE(cpu_key_k_offset(key))) { i--; } /*RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih), "vs-7010: array of entry headers not found");*/ deh += i; for (; i >= 0; i--, deh--) { if (GET_HASH_VALUE(deh_offset(deh)) != GET_HASH_VALUE(cpu_key_k_offset(key))) { /* * Hash value does not match, no need to check * whole name */ reiserfs_log(LOG_DEBUG, "name `%s' not found\n", name); return (NAME_NOT_FOUND); } /* Mark that this generation number is used */ if (de->de_gen_number_bit_string) set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), (unsigned long *)de->de_gen_number_bit_string); /* Calculate pointer to name and namelen */ de->de_entry_num = i; set_de_name_and_namelen(de); if ((retval = reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { /* * de's de_name, de_namelen, de_recordlen are set. * Fill the rest: */ /* key of pointed object */ set_de_object_key(de); store_de_entry_key(de); /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */ reiserfs_log(LOG_DEBUG, "reiserfs_match answered `%d'\n", retval); return (retval); } } if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) /* * We have reached left most entry in the node. In common * we have to go to the left neighbor, but if generation * counter is 0 already, we know for sure, that there is * no name with the same hash value */ /* FIXME: this work correctly only because hash value can * not be 0. Btw, in case of Yura's hash it is probably * possible, so, this is a bug */ return (NAME_NOT_FOUND); /*RFALSE(de->de_item_num, "vs-7015: two diritems of the same directory in one node?");*/ return (GOTO_PREVIOUS_ITEM); } /* * May return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND * FIXME: should add something like IOERROR */ static int reiserfs_find_entry(struct reiserfs_node *dp, const char *name, int namelen, struct path * path_to_entry, struct reiserfs_dir_entry *de) { struct cpu_key key_to_search; int retval; if (namelen > REISERFS_MAX_NAME(dp->i_reiserfs->s_blocksize)) return NAME_NOT_FOUND; /* We will search for this key in the tree */ make_cpu_key(&key_to_search, dp, get_third_component(dp->i_reiserfs, name, namelen), TYPE_DIRENTRY, 3); while (1) { reiserfs_log(LOG_DEBUG, "search by entry key\n"); retval = search_by_entry_key(dp->i_reiserfs, &key_to_search, path_to_entry, de); if (retval == IO_ERROR) { reiserfs_log(LOG_DEBUG, "IO error in %s\n", __FUNCTION__); return IO_ERROR; } /* Compare names for all entries having given hash value */ reiserfs_log(LOG_DEBUG, "linear search for `%s'\n", name); retval = linear_search_in_dir_item(&key_to_search, de, name, namelen); if (retval != GOTO_PREVIOUS_ITEM) { /* * There is no need to scan directory anymore. * Given entry found or does not exist */ reiserfs_log(LOG_DEBUG, "linear search returned " "(objectid=%d,dirid=%d)\n", de->de_objectid, de->de_dir_id); path_to_entry->pos_in_item = de->de_entry_num; return retval; } /* * There is left neighboring item of this directory and * given entry can be there */ set_cpu_key_k_offset(&key_to_search, le_ih_k_offset(de->de_ih) - 1); pathrelse(path_to_entry); } /* while (1) */ } Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c =================================================================== --- head/sys/gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c (revision 175201) +++ head/sys/gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c (revision 175202) @@ -1,419 +1,419 @@ /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. * Copyright (c) 2006 Russell Cattelan Digital Elves, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * Further, this software is distributed without any warranty that it is * free of the rightful claim of any third person regarding infringement * or the like. Any license provided herein, whether implied or * otherwise, applies only to this software file. Patent licenses, if * any, provided herein do not apply to combinations of this program with * other software, or any other product whatsoever. * * You should have received a copy of the GNU General Public License along * with this program; if not, write the Free Software Foundation, Inc., 59 * Temple Place - Suite 330, Boston MA 02111-1307, USA. * * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, * Mountain View, CA 94043, or: * * http://www.sgi.com * * For further information regarding this notice, see: * * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ */ #include "xfs.h" #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_alloc_btree.h" #include "xfs_bmap_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_attr_sf.h" #include "xfs_dir_sf.h" #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_vnode.h" static int xfs_vn_allocate(xfs_mount_t *, xfs_inode_t *, struct xfs_vnode **); /* * Look up an inode by number in the given file system. * The inode is looked up in the hash table for the file system * represented by the mount point parameter mp. Each bucket of * the hash table is guarded by an individual semaphore. * * If the inode is found in the hash table, its corresponding vnode * is obtained with a call to vn_get(). This call takes care of * coordination with the reclamation of the inode and vnode. Note * that the vmap structure is filled in while holding the hash lock. * This gives us the state of the inode/vnode when we found it and * is used for coordination in vn_get(). * * If it is not in core, read it in from the file system's device and * add the inode into the hash table. * * The inode is locked according to the value of the lock_flags parameter. * This flag parameter indicates how and if the inode's IO lock and inode lock * should be taken. * * mp -- the mount point structure for the current file system. It points * to the inode hash table. * tp -- a pointer to the current transaction if there is one. This is * simply passed through to the xfs_iread() call. * ino -- the number of the inode desired. This is the unique identifier * within the file system for the inode being requested. * lock_flags -- flags indicating how to lock the inode. See the comment * for xfs_ilock() for a list of valid values. * bno -- the block number starting the buffer containing the inode, * if known (as by bulkstat), else 0. */ int xfs_iget( xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp, xfs_daddr_t bno) { xfs_ihash_t *ih; xfs_inode_t *ip; xfs_inode_t *iq; xfs_vnode_t *vp; ulong version; int error; /* REFERENCED */ int newnode; xfs_chash_t *ch; xfs_chashlist_t *chl, *chlnew; vmap_t vmap; SPLDECL(s); XFS_STATS_INC(xs_ig_attempts); ih = XFS_IHASH(mp, ino); again: read_lock(&ih->ih_lock); for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { if (ip->i_ino == ino) { vp = XFS_ITOV(ip); VMAP(vp, vmap); /* * Inode cache hit: if ip is not at the front of * its hash chain, move it there now. * Do this with the lock held for update, but * do statistics after releasing the lock. */ if (ip->i_prevp != &ih->ih_next && rwlock_trypromote(&ih->ih_lock)) { if ((iq = ip->i_next)) { iq->i_prevp = ip->i_prevp; } *ip->i_prevp = iq; iq = ih->ih_next; iq->i_prevp = &ip->i_next; ip->i_next = iq; ip->i_prevp = &ih->ih_next; ih->ih_next = ip; write_unlock(&ih->ih_lock); } else { read_unlock(&ih->ih_lock); } XFS_STATS_INC(xs_ig_found); /* * Get a reference to the vnode/inode. * vn_get() takes care of coordination with * the file system inode release and reclaim * functions. If it returns NULL, the inode * has been reclaimed so just start the search * over again. We probably won't find it, * but we could be racing with another cpu * looking for the same inode so we have to at * least look. */ if (!(vp = vn_get(vp, &vmap))) { XFS_STATS_INC(xs_ig_frecycle); goto again; } if (lock_flags != 0) { ip->i_flags &= ~XFS_IRECLAIM; xfs_ilock(ip, lock_flags); } newnode = (ip->i_d.di_mode == 0); if (newnode) { xfs_iocore_inode_reinit(ip); } ip->i_flags &= ~XFS_ISTALE; vn_trace_exit(vp, "xfs_iget.found", (inst_t *)__return_address); goto return_ip; } } /* * Inode cache miss: save the hash chain version stamp and unlock * the chain, so we don't deadlock in vn_alloc. */ XFS_STATS_INC(xs_ig_missed); version = ih->ih_version; read_unlock(&ih->ih_lock); /* * Read the disk inode attributes into a new inode structure and get * a new vnode for it. This should also initialize i_ino and i_mount. */ error = xfs_iread(mp, tp, ino, &ip, bno); if (error) { return error; } error = xfs_vn_allocate(mp, ip, &vp); if (error) { return error; } vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); xfs_inode_lock_init(ip, vp); xfs_iocore_inode_init(ip); if (lock_flags != 0) { xfs_ilock(ip, lock_flags); } /* * Put ip on its hash chain, unless someone else hashed a duplicate * after we released the hash lock. */ write_lock(&ih->ih_lock); if (ih->ih_version != version) { for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { if (iq->i_ino == ino) { write_unlock(&ih->ih_lock); xfs_idestroy(ip); XFS_STATS_INC(xs_ig_dup); goto again; } } } /* * These values _must_ be set before releasing ihlock! */ ip->i_hash = ih; if ((iq = ih->ih_next)) { iq->i_prevp = &ip->i_next; } ip->i_next = iq; ip->i_prevp = &ih->ih_next; ih->ih_next = ip; ip->i_udquot = ip->i_gdquot = NULL; ih->ih_version++; write_unlock(&ih->ih_lock); /* * put ip on its cluster's hash chain */ ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL && ip->i_cnext == NULL); chlnew = NULL; ch = XFS_CHASH(mp, ip->i_blkno); chlredo: s = mutex_spinlock(&ch->ch_lock); for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { if (chl->chl_blkno == ip->i_blkno) { /* insert this inode into the doubly-linked list * where chl points */ if ((iq = chl->chl_ip)) { ip->i_cprev = iq->i_cprev; iq->i_cprev->i_cnext = ip; iq->i_cprev = ip; ip->i_cnext = iq; } else { ip->i_cnext = ip; ip->i_cprev = ip; } chl->chl_ip = ip; ip->i_chash = chl; break; } } /* no hash list found for this block; add a new hash list */ if (chl == NULL) { if (chlnew == NULL) { mutex_spinunlock(&ch->ch_lock, s); ASSERT(xfs_chashlist_zone != NULL); chlnew = (xfs_chashlist_t *) kmem_zone_alloc(xfs_chashlist_zone, KM_SLEEP); ASSERT(chlnew != NULL); goto chlredo; } else { ip->i_cnext = ip; ip->i_cprev = ip; ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; chlnew->chl_next = ch->ch_list; ch->ch_list = chlnew; chlnew = NULL; } } else { if (chlnew != NULL) { kmem_zone_free(xfs_chashlist_zone, chlnew); } } mutex_spinunlock(&ch->ch_lock, s); /* * Link ip to its mount and thread it on the mount's inode list. */ XFS_MOUNT_ILOCK(mp); if ((iq = mp->m_inodes)) { ASSERT(iq->i_mprev->i_mnext == iq); ip->i_mprev = iq->i_mprev; iq->i_mprev->i_mnext = ip; iq->i_mprev = ip; ip->i_mnext = iq; } else { ip->i_mnext = ip; ip->i_mprev = ip; } mp->m_inodes = ip; XFS_MOUNT_IUNLOCK(mp); newnode = 1; return_ip: ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); *ipp = ip; /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. */ XVFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1); return 0; } /* * Special iput for brand-new inodes that are still locked */ void xfs_iput_new(xfs_inode_t *ip, uint lock_flags) { xfs_vnode_t *vp = XFS_ITOV(ip); vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address); printf("xfs_iput_new: ip %p\n",ip); if ((ip->i_d.di_mode == 0)) { ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE)); //vn_mark_bad(vp); printf("xfs_iput_new: ip %p di_mode == 0\n",ip); /* mabe call vgone here? RMC */ } if (lock_flags) xfs_iunlock(ip, lock_flags); ASSERT_VOP_LOCKED(vp->v_vnode, "xfs_iput_new"); vput(vp->v_vnode); } extern struct vop_vector xfs_vnops; static int xfs_vn_allocate(xfs_mount_t *mp, xfs_inode_t *ip, struct xfs_vnode **vpp) { struct vnode *vp; struct xfs_vnode *vdata; int error; /* Use zone allocator here? */ vdata = kmem_zalloc(sizeof(*vdata), KM_SLEEP); error = getnewvnode("xfs", XVFSTOMNT(XFS_MTOVFS(mp)), &xfs_vnops, &vp); if (error) { kmem_free(vdata, sizeof(*vdata)); return (error); } vp->v_vnlock->lk_flags |= LK_CANRECURSE; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(vp, XVFSTOMNT(XFS_MTOVFS(mp))); if (error != 0) { kmem_free(vdata, sizeof(*vdata)); return (error); } vp->v_data = (void *)vdata; vdata->v_number= 0; vdata->v_inode = ip; vdata->v_vfsp = XFS_MTOVFS(mp); vdata->v_vnode = vp; vn_bhv_head_init(VN_BHV_HEAD(vdata), "vnode"); #ifdef CONFIG_XFS_VNODE_TRACING vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP); #endif /* CONFIG_XFS_VNODE_TRACING */ vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address); if (error == 0) *vpp = vdata; return (error); } Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_super.c =================================================================== --- head/sys/gnu/fs/xfs/FreeBSD/xfs_super.c (revision 175201) +++ head/sys/gnu/fs/xfs/FreeBSD/xfs_super.c (revision 175202) @@ -1,279 +1,279 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_clnt.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir.h" #include "xfs_dir2.h" #include "xfs_alloc.h" #include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_dir_sf.h" #include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" #include "xfs_cap.h" #include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_version.h" #include "xfs_buf.h" #include #include #include extern struct vop_vector xfs_fifoops; extern struct xfs_vnodeops xfs_vnodeops; __uint64_t xfs_max_file_offset( unsigned int blockshift) { return (OFF_MAX); } void xfs_initialize_vnode( bhv_desc_t *bdp, xfs_vnode_t *xvp, bhv_desc_t *inode_bhv, int unlock) { xfs_inode_t *ip = XFS_BHVTOI(inode_bhv); if (!inode_bhv->bd_vobj) { xvp->v_vfsp = bhvtovfs(bdp); bhv_desc_init(inode_bhv, ip, xvp, &xfs_vnodeops); bhv_insert(VN_BHV_HEAD(xvp), inode_bhv); } /* * XXX: Use VNON as an indication of freshly allocated vnode * which need to be initialized and unlocked. * This is _not_ like the same place in Linux version of * routine. */ if (xvp->v_vnode->v_type != VNON) return; xvp->v_vnode->v_type = IFTOVT(ip->i_d.di_mode); if (xvp->v_vnode->v_type == VFIFO) xvp->v_vnode->v_op = &xfs_fifoops; ASSERT_VOP_LOCKED(xvp->v_vnode, "xfs_initialize_vnode"); /* For new inodes we need to set the ops vectors, * and unlock the inode. */ if (ip->i_d.di_mode != 0 && unlock) VOP_UNLOCK(xvp->v_vnode, 0, curthread); } #if 0 struct vnode * xfs_get_inode( bhv_desc_t *bdp, xfs_ino_t ino, int flags) { return NULL; } #endif /*ARGSUSED*/ int xfs_blkdev_get( xfs_mount_t *mp, const char *name, struct vnode **bdevp) { struct nameidata nd; struct nameidata *ndp = &nd; int error, ronly; struct thread *td; struct vnode *devvp; struct g_consumer *cp; struct g_provider *pp; mode_t accessmode; td = curthread; NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, name, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vrele(devvp); return (error); } - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); ronly = ((XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) != 0); accessmode = VREAD; if (!ronly) accessmode |= VWRITE; error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } DROP_GIANT(); g_topology_lock(); /* * XXX: Do not allow more than one consumer to open a device * associated with a particular GEOM provider. * This disables multiple read-only mounts of a device, * but it gets rid of panics in bmemfree() when you try to * mount the same device more than once. * During mounting, XFS does a bread() of the superblock, but does * not brelse() it. A subsequent mount of the same device * will try to bread() the superblock, resulting in a panic in * bremfree(), "buffer not on queue". */ pp = g_dev_getprovider(devvp->v_rdev); if ((pp != NULL) && ((pp->acr | pp->acw | pp->ace ) != 0)) error = EPERM; else error = g_vfs_open(devvp, &cp, "xfs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); if (error) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, td); devvp->v_bufobj.bo_private = cp; devvp->v_bufobj.bo_ops = &xfs_bo_ops; *bdevp = devvp; return (0); } void xfs_blkdev_put( struct vnode *devvp) { struct g_consumer *cp; if (devvp == NULL) return; vinvalbuf(devvp, V_SAVE, curthread, 0, 0); cp = devvp->v_bufobj.bo_private; DROP_GIANT(); g_topology_lock(); g_wither_geom_close(cp->geom, ENXIO); g_topology_unlock(); PICKUP_GIANT(); vrele(devvp); } void xfs_mountfs_check_barriers(xfs_mount_t *mp) { printf("xfs_mountfs_check_barriers NI\n"); } void xfs_flush_inode( xfs_inode_t *ip) { printf("xfs_flush_inode NI\n"); } void xfs_flush_device( xfs_inode_t *ip) { printf("xfs_flush_device NI\n"); xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); } void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { printf("xfs_blkdev_issue_flush NI\n"); } int init_xfs_fs( void ) { static char message[] = XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"; printf(message); vn_init(); xfs_init(); uuid_init(); #ifdef RMC vfs_initdmapi(); #endif vfs_initquota(); return 0; } void exit_xfs_fs(void) { xfs_cleanup(); vfs_exitquota(); #ifdef RMC vfs_exitdmapi(); #endif } Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_vnode.c =================================================================== --- head/sys/gnu/fs/xfs/FreeBSD/xfs_vnode.c (revision 175201) +++ head/sys/gnu/fs/xfs/FreeBSD/xfs_vnode.c (revision 175202) @@ -1,267 +1,267 @@ /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * Further, this software is distributed without any warranty that it is * free of the rightful claim of any third person regarding infringement * or the like. Any license provided herein, whether implied or * otherwise, applies only to this software file. Patent licenses, if * any, provided herein do not apply to combinations of this program with * other software, or any other product whatsoever. * * You should have received a copy of the GNU General Public License along * with this program; if not, write the Free Software Foundation, Inc., 59 * Temple Place - Suite 330, Boston MA 02111-1307, USA. * * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, * Mountain View, CA 94043, or: * * http://www.sgi.com * * For further information regarding this notice, see: * * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ */ #include "xfs.h" #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_alloc_btree.h" #include "xfs_bmap_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_btree.h" #include "xfs_imap.h" #include "xfs_alloc.h" #include "xfs_attr_sf.h" #include "xfs_dir_sf.h" #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_ialloc.h" #include "xfs_inode.h" #include "xfs_inode_item.h" void vn_init(void) { } void vn_iowait( struct xfs_vnode *vp) { printf("vn_iowait doing nothing on FreeBSD?\n"); } struct xfs_vnode * vn_initialize( xfs_vnode_t *vp) { XFS_STATS_INC(vn_active); XFS_STATS_INC(vn_alloc); /* Initialize the first behavior and the behavior chain head. */ vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode"); #ifdef CONFIG_XFS_VNODE_TRACING vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP); #endif /* CONFIG_XFS_VNODE_TRACING */ vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address); return vp; } /* * Get a reference on a vnode. Need to drop vnode reference * to accomodate for vhold by VMAP regardless of whether or * not we were able to successfully grab the vnode. */ xfs_vnode_t * vn_get( struct xfs_vnode *xfs_vp, vmap_t *vmap) { struct vnode *vp; int error; XFS_STATS_INC(vn_get); vp = vmap->v_vp; error = vget(vp, 0, curthread); if (error) { vdrop(vp); return (NULL); } vdrop(vp); if (vp->v_data != xfs_vp) { vput(vp); return (NULL); } vn_trace_exit(vp, "vn_get", (inst_t *)__return_address); return xfs_vp; } /* * purge a vnode from the cache * At this point the vnode is guaranteed to have no references (vn_count == 0) * The caller has to make sure that there are no ways someone could * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock). */ void vn_purge(struct xfs_vnode *xfs_vp) { struct vnode *vp; vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address); vp = xfs_vp->v_vnode; - vn_lock(vp, LK_EXCLUSIVE, curthread); + vn_lock(vp, LK_EXCLUSIVE); if (vp->v_holdcnt == 0) vhold(vp); vgone(vp); VOP_UNLOCK(vp, 0, curthread); } void xfs_ichgtime( xfs_inode_t *ip, int flags) { timespec_t tv; vfs_timestamp(&tv); if (flags & XFS_ICHGTIME_MOD) { ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; } if (flags & XFS_ICHGTIME_ACC) { ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; } if (flags & XFS_ICHGTIME_CHG) { ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; } //printf ("xfs_ichgtime NI\n"); } /* * Bring the atime in the XFS inode uptodate. * Used before logging the inode to disk or when the Linux inode goes away. */ /* * It's unclear if we need this since this is for syncing the linux inode's atime * to the xfs inode's atime. * Since FreeBSD doesn't have atime in the vnode is there anything to really * sync over? * For now just make this a update atime call */ void xfs_synchronize_atime( xfs_inode_t *ip) { #if 0 xfs_vnode_t *vp; #endif timespec_t tv; /* vfs_timestamp looks at the system time accuracy variable */ vfs_timestamp(&tv); #if 0 printf("xfs_synchronize_atime old (%d,%d) new (%d,%ld)\n", ip->i_d.di_atime.t_sec, ip->i_d.di_atime.t_nsec, tv.tv_sec, tv.tv_nsec); #endif ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; } #ifdef RMC /* * Extracting atime values in various formats */ void vn_atime_to_bstime(struct xfs_vnode *vp, xfs_bstime_t *bs_atime) { bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec; bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec; printf("vn_atime_to_bstime NI\n"); } #endif #ifdef CONFIG_XFS_VNODE_TRACING #define KTRACE_ENTER(vp, vk, s, line, ra) \ ktrace_enter( (vp)->v_trace, \ /* 0 */ (void *)(__psint_t)(vk), \ /* 1 */ (void *)(s), \ /* 2 */ (void *)(__psint_t) line, \ /* 3 */ (void *)(vn_count(vp)), \ /* 4 */ (void *)(ra), \ /* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ /* 6 */ (void *)(__psint_t)smp_processor_id(), \ /* 7 */ (void *)(__psint_t)(current->pid), \ /* 8 */ (void *)__return_address, \ /* 9 */ 0, 0, 0, 0, 0, 0, 0) /* * Vnode tracing code. */ void vn_trace_entry(xfs_vnode_t *vp, char *func, inst_t *ra) { KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra); } void vn_trace_exit(xfs_vnode_t *vp, char *func, inst_t *ra) { KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra); } void vn_trace_hold(xfs_vnode_t *vp, char *file, int line, inst_t *ra) { KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra); } void vn_trace_ref(xfs_vnode_t *vp, char *file, int line, inst_t *ra) { KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra); } void vn_trace_rele(xfs_vnode_t *vp, char *file, int line, inst_t *ra) { KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra); } #endif /* CONFIG_XFS_VNODE_TRACING */ Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_vnops.c =================================================================== --- head/sys/gnu/fs/xfs/FreeBSD/xfs_vnops.c (revision 175201) +++ head/sys/gnu/fs/xfs/FreeBSD/xfs_vnops.c (revision 175202) @@ -1,1699 +1,1699 @@ /* * Copyright (c) 2001, Alexander Kabaev * Copyright (c) 2006, Russell Cattelan Digital Elves Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NO_VFS_MACROS #include "xfs.h" #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_alloc_btree.h" #include "xfs_bmap_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_btree.h" #include "xfs_imap.h" #include "xfs_attr.h" #include "xfs_attr_sf.h" #include "xfs_dir_sf.h" #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_acl.h" #include "xfs_cap.h" #include "xfs_mac.h" #include "xfs_iomap.h" #include "xfs_clnt.h" #include "xfs_mountops.h" /* * Prototypes for XFS vnode operations. */ static vop_access_t _xfs_access; static vop_advlock_t _xfs_advlock; static vop_bmap_t _xfs_bmap; static vop_cachedlookup_t _xfs_cachedlookup; static vop_close_t _xfs_close; static vop_create_t _xfs_create; static vop_deleteextattr_t _xfs_deleteextattr; static vop_fsync_t _xfs_fsync; static vop_getattr_t _xfs_getattr; static vop_getextattr_t _xfs_getextattr; static vop_inactive_t _xfs_inactive; static vop_ioctl_t _xfs_ioctl; static vop_link_t _xfs_link; static vop_listextattr_t _xfs_listextattr; static vop_mkdir_t _xfs_mkdir; static vop_mknod_t _xfs_mknod; static vop_open_t _xfs_open; static vop_read_t _xfs_read; static vop_readdir_t _xfs_readdir; static vop_readlink_t _xfs_readlink; static vop_reclaim_t _xfs_reclaim; static vop_remove_t _xfs_remove; static vop_rename_t _xfs_rename; static vop_rmdir_t _xfs_rmdir; static vop_setattr_t _xfs_setattr; static vop_setextattr_t _xfs_setextattr; static vop_strategy_t _xfs_strategy; static vop_symlink_t _xfs_symlink; static vop_write_t _xfs_write; static vop_vptofh_t _xfs_vptofh; struct vop_vector xfs_vnops = { .vop_default = &default_vnodeops, .vop_access = _xfs_access, .vop_advlock = _xfs_advlock, .vop_bmap = _xfs_bmap, .vop_cachedlookup = _xfs_cachedlookup, .vop_close = _xfs_close, .vop_create = _xfs_create, .vop_deleteextattr = _xfs_deleteextattr, .vop_fsync = _xfs_fsync, .vop_getattr = _xfs_getattr, .vop_getextattr = _xfs_getextattr, .vop_inactive = _xfs_inactive, .vop_ioctl = _xfs_ioctl, .vop_link = _xfs_link, .vop_listextattr = _xfs_listextattr, .vop_lookup = vfs_cache_lookup, .vop_mkdir = _xfs_mkdir, .vop_mknod = _xfs_mknod, .vop_open = _xfs_open, .vop_read = _xfs_read, .vop_readdir = _xfs_readdir, .vop_readlink = _xfs_readlink, .vop_reclaim = _xfs_reclaim, .vop_remove = _xfs_remove, .vop_rename = _xfs_rename, .vop_rmdir = _xfs_rmdir, .vop_setattr = _xfs_setattr, .vop_setextattr = _xfs_setextattr, .vop_strategy = _xfs_strategy, .vop_symlink = _xfs_symlink, .vop_write = _xfs_write, .vop_vptofh = _xfs_vptofh, }; /* * FIFO's specific operations. */ static vop_close_t _xfsfifo_close; static vop_read_t _xfsfifo_read; static vop_kqfilter_t _xfsfifo_kqfilter; static vop_write_t _xfsfifo_write; struct vop_vector xfs_fifoops = { .vop_default = &fifo_specops, .vop_access = _xfs_access, .vop_close = _xfsfifo_close, .vop_fsync = _xfs_fsync, .vop_getattr = _xfs_getattr, .vop_inactive = _xfs_inactive, .vop_kqfilter = _xfsfifo_kqfilter, .vop_read = _xfsfifo_read, .vop_reclaim = _xfs_reclaim, .vop_setattr = _xfs_setattr, .vop_write = _xfsfifo_write, .vop_vptofh = _xfs_vptofh, }; static int _xfs_access( struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap) { int error; XVOP_ACCESS(VPTOXFSVP(ap->a_vp), ap->a_mode, ap->a_cred, error); return (error); } static int _xfs_open( struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap) { int error; XVOP_OPEN(VPTOXFSVP(ap->a_vp), ap->a_cred, error); if (error == 0) vnode_create_vobject(ap->a_vp, 0, ap->a_td); return (error); } static int _xfs_close( struct vop_close_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap) { int error = 0; /* XVOP_CLOSE(VPTOXFSVP(ap->a_vp), NULL, error); */ return (error); } static int _xfs_getattr( struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct mount *mp; xfs_vattr_t va; int error; /* extract the xfs vnode from the private data */ //xfs_vnode_t *xvp = (xfs_vnode_t *)vp->v_data; VATTR_NULL(vap); memset(&va,0,sizeof(xfs_vattr_t)); va.va_mask = XFS_AT_STAT|XFS_AT_GENCOUNT|XFS_AT_XFLAGS; XVOP_GETATTR(VPTOXFSVP(vp), &va, 0, ap->a_cred, error); if (error) return (error); mp = vp->v_mount; vap->va_type = IFTOVT(((xfs_vnode_t *)vp->v_data)->v_inode->i_d.di_mode); vap->va_mode = va.va_mode; vap->va_nlink = va.va_nlink; vap->va_uid = va.va_uid; vap->va_gid = va.va_gid; vap->va_fsid = mp->mnt_stat.f_fsid.val[0]; vap->va_fileid = va.va_nodeid; vap->va_size = va.va_size; vap->va_blocksize = va.va_blocksize; vap->va_atime = va.va_atime; vap->va_mtime = va.va_mtime; vap->va_ctime = va.va_ctime; vap->va_gen = va.va_gen; vap->va_rdev = va.va_rdev; vap->va_bytes = (va.va_nblocks << BBSHIFT); /* XFS now supports devices that have block sizes * other than 512 so BBSHIFT will work for now * but need to get this value from the super block */ /* * Fields with no direct equivalent in XFS * leave initialized by VATTR_NULL */ #if 0 vap->va_filerev = 0; vap->va_birthtime = va.va_ctime; vap->va_vaflags = 0; vap->va_flags = 0; vap->va_spare = 0; #endif return (0); } static int _xfs_setattr( struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; xfs_vattr_t va; int error; /* * Check for unsettable attributes. */ #ifdef RMC if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) return (EINVAL); #endif memset(&va, 0, sizeof(va)); if (vap->va_uid != (uid_t)VNOVAL) { va.va_mask |= XFS_AT_UID; va.va_uid = vap->va_uid; } if (vap->va_gid != (gid_t)VNOVAL) { va.va_mask |= XFS_AT_GID; va.va_gid = vap->va_gid; } if (vap->va_size != VNOVAL) { va.va_mask |= XFS_AT_SIZE; va.va_size = vap->va_size; } if (vap->va_atime.tv_sec != VNOVAL) { va.va_mask |= XFS_AT_ATIME; va.va_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { va.va_mask |= XFS_AT_MTIME; va.va_mtime = vap->va_mtime; } if (vap->va_ctime.tv_sec != VNOVAL) { va.va_mask |= XFS_AT_CTIME; va.va_ctime = vap->va_ctime; } if (vap->va_mode != (mode_t)VNOVAL) { va.va_mask |= XFS_AT_MODE; va.va_mode = vap->va_mode; } XVOP_SETATTR(VPTOXFSVP(vp), &va, 0, ap->a_cred, error); return (error); } static int _xfs_inactive( struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int error; XVOP_INACTIVE(VPTOXFSVP(vp), td->td_ucred, error); return (error); } static int _xfs_read( struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; switch (vp->v_type) { case VREG: break; case VDIR: return (EISDIR); default: return (EPERM); }; XVOP_READ(VPTOXFSVP(vp), uio, ap->a_ioflag, ap->a_cred, error); return error; } int xfs_read_file(xfs_mount_t *mp, xfs_inode_t *ip, struct uio *uio, int ioflag) { xfs_fileoff_t lbn, nextlbn; xfs_fsize_t bytesinfile; long size, xfersize, blkoffset; struct buf *bp; struct vnode *vp; int error, orig_resid; int seqcount; seqcount = ioflag >> IO_SEQSHIFT; orig_resid = uio->uio_resid; if (orig_resid <= 0) return (0); vp = XFS_ITOV(ip)->v_vnode; /* * Ok so we couldn't do it all in one vm trick... * so cycle around trying smaller bites.. */ for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_d.di_size - uio->uio_offset) <= 0) break; lbn = XFS_B_TO_FSBT(mp, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = mp->m_sb.sb_blocksize; blkoffset = XFS_B_FSB_OFFSET(mp, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = mp->m_sb.sb_blocksize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (XFS_FSB_TO_B(mp, nextlbn) >= ip->i_d.di_size ) { /* * Don't do readahead if this is the end of the file. */ error = bread(vp, lbn, size, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. */ error = cluster_read(vp, ip->i_d.di_size, lbn, size, NOCRED, uio->uio_resid, seqcount, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ int nextsize = mp->m_sb.sb_blocksize; error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread(vp, lbn, size, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. */ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } /* * otherwise use the general form */ error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; if (ioflag & (IO_VMIO|IO_DIRECT) ) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. The VM has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if (ioflag & (IO_VMIO|IO_DIRECT)) { bp->b_flags |= B_RELBUF; brelse(bp); } else bqrelse(bp); } return (error); } static int _xfs_write(struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; int error; xfs_vnode_t *xvp = (xfs_vnode_t *)vp->v_data; error = xfs_write(xvp->v_bh.bh_first, uio, ioflag, ap->a_cred); if (error < 0) { printf("Xfs_write got error %d\n",error); return -error; } return 0; } int xfs_write_file(xfs_inode_t *xip, struct uio *uio, int ioflag) { struct buf *bp; //struct thread *td; daddr_t lbn; off_t osize = 0; off_t offset= 0; int blkoffset, error, resid, xfersize; int fsblocksize; int seqcount; xfs_iomap_t iomap; int maps = 1; xfs_vnode_t *xvp = XFS_ITOV(xip); struct vnode *vp = xvp->v_vnode; xfs_mount_t *mp = (&xip->i_iocore)->io_mount; seqcount = ioflag >> IO_SEQSHIFT; memset(&iomap,0,sizeof(xfs_iomap_t)); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ #if 0 td = uio->uio_td; if (vp->v_type == VREG && td != NULL) { PROC_LOCK(td->td_proc); if (uio->uio_offset + uio->uio_resid > lim_cur(td->td_proc, RLIMIT_FSIZE)) { psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); } #endif resid = uio->uio_resid; offset = uio->uio_offset; osize = xip->i_d.di_size; /* xfs bmap wants bytes for both offset and size */ XVOP_BMAP(xvp, uio->uio_offset, uio->uio_resid, BMAPI_WRITE|BMAPI_DIRECT, &iomap, &maps, error); if(error) { printf("XVOP_BMAP failed\n"); goto error; } for (error = 0; uio->uio_resid > 0;) { lbn = XFS_B_TO_FSBT(mp, offset); blkoffset = XFS_B_FSB_OFFSET(mp, offset); xfersize = mp->m_sb.sb_blocksize - blkoffset; fsblocksize = mp->m_sb.sb_blocksize; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * getblk sets buf by blkno * bo->bo_bsize * bo_bsize is set from the mnt point fsize * so we call getblk in the case using fsblocks * not basic blocks */ bp = getblk(vp, lbn, fsblocksize, 0, 0, 0); if(!bp) { printf("getblk failed\n"); error = EINVAL; break; } if (!(bp->b_flags & B_CACHE) && fsblocksize > xfersize) vfs_bio_clrbuf(bp); if (offset + xfersize > xip->i_d.di_size) { xip->i_d.di_size = offset + xfersize; vnode_pager_setsize(vp, offset + fsblocksize); } /* move the offset for the next itteration of the loop */ offset += xfersize; error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if ((ioflag & IO_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) /* in ext2fs? */ bp->b_flags |= B_RELBUF; /* force to full direct for now */ bp->b_flags |= B_DIRECT; /* and sync ... the delay path is not pushing data out */ ioflag |= IO_SYNC; if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (0 /* RMC xfersize + blkoffset == fs->s_frag_size */) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, osize, seqcount); } else { bawrite(bp); } } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ #if 0 if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_mode &= ~(ISUID | ISGID); #endif if (error) { if (ioflag & IO_UNIT) { #if 0 (void)ext2_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_td); #endif uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { /* Update the vnode here? */ } error: return error; } static int _xfs_create( struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct thread *td = curthread; struct ucred *credp = td->td_ucred; struct componentname *cnp = ap->a_cnp; xfs_vnode_t *xvp; xfs_vattr_t va; int error; memset(&va, 0, sizeof (va)); va.va_mask |= XFS_AT_MODE; va.va_mode = vap->va_mode; va.va_mask |= XFS_AT_TYPE; va.va_mode |= VTTOIF(vap->va_type); xvp = NULL; XVOP_CREATE(VPTOXFSVP(dvp), cnp, &va, &xvp, credp, error); if (error == 0) { *ap->a_vpp = xvp->v_vnode; VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td); } return (error); } extern int xfs_remove(bhv_desc_t *, bhv_desc_t *, vname_t *, cred_t *); static int _xfs_remove( struct vop_remove_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode * a_vp; struct componentname * a_cnp; } */ *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct ucred *credp = td->td_ucred; /* struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; */ int error; if (vp->v_type == VDIR || vp->v_usecount != 1) return (EPERM); error = xfs_remove(VPTOXFSVP(ap->a_dvp)->v_bh.bh_first, VPTOXFSVP(ap->a_vp)->v_bh.bh_first, ap->a_cnp,credp); cache_purge(vp); return error; } static int _xfs_rename( struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap) { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; /* struct componentname *tcnp = ap->a_tcnp; */ /* struct componentname *fcnp = ap->a_fcnp;*/ int error = EPERM; if (error) goto out; /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (tvp && tvp->v_usecount > 1) { error = EBUSY; goto out; } if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); vgone(fvp); if (tvp) vgone(tvp); return (error); } static int _xfs_link( struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap) { xfs_vnode_t *tdvp, *vp; int error; tdvp = VPTOXFSVP(ap->a_tdvp); vp = VPTOXFSVP(ap->a_vp); XVOP_LINK(tdvp, vp, ap->a_cnp, NULL, error); return (error); } static int _xfs_symlink( struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap) { struct thread *td = curthread; struct ucred *credp = td->td_ucred; xfs_vnode_t *xvp; xfs_vattr_t va; int error; memset(&va, 0, sizeof (va)); va.va_mask |= XFS_AT_MODE; va.va_mode = ap->a_vap->va_mode | S_IFLNK; va.va_mask |= XFS_AT_TYPE; XVOP_SYMLINK(VPTOXFSVP(ap->a_dvp), ap->a_cnp, &va, ap->a_target, &xvp, credp, error); if (error == 0) { *ap->a_vpp = xvp->v_vnode; VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td); } return (error); } static int _xfs_mknod( struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct thread *td = curthread; struct ucred *credp = td->td_ucred; struct componentname *cnp = ap->a_cnp; xfs_vnode_t *xvp; xfs_vattr_t va; int error; memset(&va, 0, sizeof (va)); va.va_mask |= XFS_AT_MODE; va.va_mode = vap->va_mode | S_IFIFO; va.va_mask |= XFS_AT_TYPE; va.va_mask |= XFS_AT_RDEV; va.va_rdev = vap->va_rdev; xvp = NULL; XVOP_CREATE(VPTOXFSVP(dvp), cnp, &va, &xvp, credp, error); if (error == 0) { *ap->a_vpp = xvp->v_vnode; VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td); } return (error); } static int _xfs_mkdir( struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct thread *td = curthread; struct ucred *credp = td->td_ucred; struct componentname *cnp = ap->a_cnp; xfs_vnode_t *xvp; xfs_vattr_t va; int error; memset(&va, 0, sizeof (va)); va.va_mask |= XFS_AT_MODE; va.va_mode = vap->va_mode | S_IFDIR; va.va_mask |= XFS_AT_TYPE; xvp = NULL; XVOP_MKDIR(VPTOXFSVP(dvp), cnp, &va, &xvp, credp, error); if (error == 0) { *ap->a_vpp = xvp->v_vnode; VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td); } return (error); } static int _xfs_rmdir( struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; /* struct componentname *cnp = ap->a_cnp; */ int error; if (dvp == vp) return (EINVAL); error = EPERM; return (error); } static int _xfs_readdir( struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; off_t off; int eof = 0; if (vp->v_type != VDIR) return (EPERM); if (ap->a_ncookies) { return (EOPNOTSUPP); } error = 0; while (!eof){ off = (int)uio->uio_offset; XVOP_READDIR(VPTOXFSVP(vp), uio, NULL, &eof, error); if ((uio->uio_offset == off) || error) { break; } } if (ap->a_eofflag) *ap->a_eofflag = (eof != 0); return (error); } static int _xfs_readlink( struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; int error; XVOP_READLINK(VPTOXFSVP(vp), uio, 0, cred, error); return (error); } static int _xfs_fsync( struct vop_fsync_args /* { struct vnode * a_vp; int a_waitfor; struct thread * a_td; } */ *ap) { xfs_vnode_t *vp = VPTOXFSVP(ap->a_vp); int flags = FSYNC_DATA; int error; if (ap->a_waitfor == MNT_WAIT) flags |= FSYNC_WAIT; XVOP_FSYNC(vp, flags, ap->a_td->td_ucred, (xfs_off_t)0, (xfs_off_t)-1, error); return (error); } static int _xfs_bmap( struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap) { xfs_iomap_t iomap; xfs_off_t offset; ssize_t size; struct mount *mp; struct xfs_mount *xmp; struct xfs_vnode *xvp; int error, maxrun, retbm; mp = ap->a_vp->v_mount; xmp = XFS_VFSTOM(MNTTOVFS(mp)); if (ap->a_bop != NULL) *ap->a_bop = &xmp->m_ddev_targp->specvp->v_bufobj; if (ap->a_bnp == NULL) return (0); xvp = VPTOXFSVP(ap->a_vp); retbm = 1; offset = XFS_FSB_TO_B(xmp, ap->a_bn); size = XFS_FSB_TO_B(xmp, 1); XVOP_BMAP(xvp, offset, size, BMAPI_READ, &iomap, &retbm, error); if (error) return (error); if (retbm == 0 || iomap.iomap_bn == IOMAP_DADDR_NULL) { *ap->a_bnp = (daddr_t)-1; if (ap->a_runb) *ap->a_runb = 0; if (ap->a_runp) *ap->a_runp = 0; } else { *ap->a_bnp = iomap.iomap_bn + btodb(iomap.iomap_delta); maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; if (ap->a_runb) { *ap->a_runb = XFS_B_TO_FSB(xmp, iomap.iomap_delta); if (*ap->a_runb > maxrun) *ap->a_runb = maxrun; } if (ap->a_runp) { *ap->a_runp = XFS_B_TO_FSB(xmp, iomap.iomap_bsize - iomap.iomap_delta - size); if (*ap->a_runp > maxrun) *ap->a_runp = maxrun; } } return (0); } static int _xfs_strategy( struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap) { daddr_t blkno; struct buf *bp;; struct bufobj *bo; struct vnode *vp; struct xfs_mount *xmp; int error; bp = ap->a_bp; vp = ap->a_vp; KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)", __func__, ap->a_vp, ap->a_bp->b_vp)); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &blkno, NULL, NULL); bp->b_blkno = blkno; bp->b_iooffset = (blkno << BBSHIFT); if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } xmp = XFS_VFSTOM(MNTTOVFS(vp->v_mount)); bo = &xmp->m_ddev_targp->specvp->v_bufobj; bo->bo_ops->bop_strategy(bo, bp); return (0); } int _xfs_ioctl( struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int fflag; struct ucred *cred; struct thread *a_td; } */ *ap) { /* struct vnode *vp = ap->a_vp; */ /* struct thread *p = ap->a_td; */ /* struct file *fp; */ int error; xfs_vnode_t *xvp = VPTOXFSVP(ap->a_vp); printf("_xfs_ioctl cmd 0x%lx data %p\n",ap->a_command,ap->a_data); // XVOP_IOCTL(xvp,(void *)NULL,(void *)NULL,ap->a_fflag,ap->a_command,ap->a_data,error); error = xfs_ioctl(xvp->v_bh.bh_first,NULL,NULL,ap->a_fflag,ap->a_command,ap->a_data); return error; } int _xfs_advlock( struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap) { /* struct vnode *vp = ap->a_vp;*/ struct flock *fl = ap->a_fl; /* caddr_t id = (caddr_t)1 */ /* ap->a_id */; /* int flags = ap->a_flags; */ off_t start, end, size; int error/* , lkop */; /*KAN: temp */ return (EOPNOTSUPP); size = 0; error = 0; switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: start = fl->l_start; break; case SEEK_END: start = fl->l_start + size; default: return (EINVAL); } if (start < 0) return (EINVAL); if (fl->l_len == 0) end = -1; else { end = start + fl->l_len - 1; if (end < start) return (EINVAL); } #ifdef notyet switch (ap->a_op) { case F_SETLK: error = lf_advlock(ap, &np->n_lockf, size); break; case F_UNLCK: lf_advlock(ap, &np->n_lockf, size); break; case F_GETLK: error = lf_advlock(ap, &np->n_lockf, size); break; default: return (EINVAL); } #endif return (error); } static int _xfs_cachedlookup( struct vop_cachedlookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap) { struct vnode *dvp, *tvp; struct xfs_vnode *cvp; int islastcn; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; struct thread *td = cnp->cn_thread; char *pname = cnp->cn_nameptr; int namelen = cnp->cn_namelen; *vpp = NULL; dvp = ap->a_dvp; islastcn = flags & ISLASTCN; XVOP_LOOKUP(VPTOXFSVP(dvp), cnp, &cvp, 0, NULL, cred, error); if (error == ENOENT) { if ((nameiop == CREATE || nameiop == RENAME || nameiop == DELETE) && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, *vpp, cnp); return (error); } if (error) return (error); tvp = cvp->v_vnode; if (nameiop == DELETE && islastcn) { - if ((error = vn_lock(tvp, LK_EXCLUSIVE, td))) { + if ((error = vn_lock(tvp, LK_EXCLUSIVE))) { vrele(tvp); goto err_out; } *vpp = tvp; /* Directory should be writable for deletes. */ error = VOP_ACCESS(dvp, VWRITE, cred, td); if (error) goto err_out; /* XXXKAN: Permission checks for sticky dirs? */ return (0); } if (nameiop == RENAME && islastcn) { - if ((error = vn_lock(tvp, LK_EXCLUSIVE, td))) { + if ((error = vn_lock(tvp, LK_EXCLUSIVE))) { vrele(tvp); goto err_out; } *vpp = tvp; if ((error = VOP_ACCESS(dvp, VWRITE, cred, td))) goto err_out; return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); - error = vn_lock(tvp, cnp->cn_lkflags, td); + error = vn_lock(tvp, cnp->cn_lkflags); if (error) { - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vrele(tvp); goto err_out; } *vpp = tvp; } else if (namelen == 1 && pname[0] == '.') { *vpp = tvp; KASSERT(tvp == dvp, ("not same directory")); } else { - if ((error = vn_lock(tvp, cnp->cn_lkflags, td))) { + if ((error = vn_lock(tvp, cnp->cn_lkflags))) { vrele(tvp); goto err_out; } *vpp = tvp; } if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (0); err_out: if (*vpp != 0) vput(*vpp); return (error); } static int _xfs_reclaim( struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap) { struct vnode *vp = ap->a_vp; struct xfs_vnode *xfs_vp = VPTOXFSVP(vp); int error; XVOP_RECLAIM(xfs_vp, error); kmem_free(xfs_vp, sizeof(*xfs_vp)); vp->v_data = NULL; return (error); } static int _xfs_kqfilter( struct vop_kqfilter_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct knote *a_kn; } */ *ap) { return (0); } struct xfs_inode * xfs_vtoi(struct xfs_vnode *xvp) { return(XFS_BHVTOI(xvp->v_fbhv)); } /* * Read wrapper for fifos. */ static int _xfsfifo_read( struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap) { int error, resid; struct xfs_inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = fifo_specops.vop_read(ap); ip = xfs_vtoi(VPTOXFSVP(ap->a_vp)); if ((ap->a_vp->v_mount->mnt_flag & MNT_NOATIME) == 0 && ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) xfs_ichgtime(ip, XFS_ICHGTIME_ACC); return (error); } /* * Write wrapper for fifos. */ static int _xfsfifo_write( struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap) { int error, resid; struct uio *uio; struct xfs_inode *ip; uio = ap->a_uio; resid = uio->uio_resid; error = fifo_specops.vop_write(ap); ip = xfs_vtoi(VPTOXFSVP(ap->a_vp)); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); return (error); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int _xfsfifo_close( struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap) { return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ufs kqfilter routines if needed */ static int _xfsfifo_kqfilter( struct vop_kqfilter_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct knote *a_kn; } */ *ap) { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = _xfs_kqfilter(ap); return (error); } static int _xfs_getextattr( struct vop_getextattr_args /* { struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; } */ *ap) { int error; char *value; int size; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); size = ATTR_MAX_VALUELEN; value = (char *)kmem_zalloc(size, KM_SLEEP); if (value == NULL) return (ENOMEM); XVOP_ATTR_GET(VPTOXFSVP(ap->a_vp), ap->a_name, value, &size, 1, ap->a_cred, error); if (ap->a_uio != NULL) { if (ap->a_uio->uio_iov->iov_len < size) error = ERANGE; else uiomove(value, size, ap->a_uio); } if (ap->a_size != NULL) *ap->a_size = size; kmem_free(value, ATTR_MAX_VALUELEN); return (error); } static int _xfs_listextattr( struct vop_listextattr_args /* { struct vnode *a_vp; int a_attrnamespace; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; } */ *ap) { int error; char *buf = NULL; int buf_len = 0; attrlist_cursor_kern_t cursor = { 0 }; int i; char name_len; int attrnames_len = 0; int xfs_flags = ATTR_KERNAMELS; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_attrnamespace & EXTATTR_NAMESPACE_USER) xfs_flags |= ATTR_KERNORMALS; if (ap->a_attrnamespace & EXTATTR_NAMESPACE_SYSTEM) xfs_flags |= ATTR_KERNROOTLS; if (ap->a_uio == NULL || ap->a_uio->uio_iov[0].iov_base == NULL) { xfs_flags |= ATTR_KERNOVAL; buf_len = 0; } else { buf = ap->a_uio->uio_iov[0].iov_base; buf_len = ap->a_uio->uio_iov[0].iov_len; } XVOP_ATTR_LIST(VPTOXFSVP(ap->a_vp), buf, buf_len, xfs_flags, &cursor, ap->a_cred, error); if (error < 0) { attrnames_len = -error; error = 0; } if (buf == NULL) goto done; /* * extattr_list expects a list of names. Each list * entry consists of one byte for the name length, followed * by the name (not null terminated) */ name_len=0; for(i=attrnames_len-1; i > 0 ; --i) { buf[i] = buf[i-1]; if (buf[i]) ++name_len; else { buf[i] = name_len; name_len = 0; } } buf[0] = name_len; if (ap->a_uio != NULL) ap->a_uio->uio_resid -= attrnames_len; done: if (ap->a_size != NULL) *ap->a_size = attrnames_len; return (error); } static int _xfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { char *val; size_t vallen; int error, xfs_flags; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (ap->a_uio == NULL) return (EINVAL); vallen = ap->a_uio->uio_resid; if (vallen > ATTR_MAX_VALUELEN) return (EOVERFLOW); if (ap->a_name[0] == '\0') return (EINVAL); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); xfs_flags = 0; if (ap->a_attrnamespace & EXTATTR_NAMESPACE_USER) xfs_flags |= ATTR_KERNORMALS; if (ap->a_attrnamespace & EXTATTR_NAMESPACE_SYSTEM) xfs_flags |= ATTR_KERNROOTLS; val = (char *)kmem_zalloc(vallen, KM_SLEEP); if (val == NULL) return (ENOMEM); error = uiomove(val, (int)vallen, ap->a_uio); if (error) goto err_out; XVOP_ATTR_SET(VPTOXFSVP(ap->a_vp), ap->a_name, val, vallen, xfs_flags, ap->a_cred, error); err_out: kmem_free(val, vallen); return(error); } static int _xfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { int error, xfs_flags; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (ap->a_name[0] == '\0') return (EINVAL); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); xfs_flags = 0; if (ap->a_attrnamespace & EXTATTR_NAMESPACE_USER) xfs_flags |= ATTR_KERNORMALS; if (ap->a_attrnamespace & EXTATTR_NAMESPACE_SYSTEM) xfs_flags |= ATTR_KERNROOTLS; XVOP_ATTR_REMOVE(VPTOXFSVP(ap->a_vp), ap->a_name, xfs_flags, ap->a_cred, error); return (error); } static int _xfs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { printf("xfs_vptofh"); return ENOSYS; } Index: head/sys/i386/ibcs2/ibcs2_misc.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_misc.c (revision 175201) +++ head/sys/i386/ibcs2/ibcs2_misc.c (revision 175202) @@ -1,1246 +1,1246 @@ /*- * Copyright (c) 1995 Steven Wallace * Copyright (c) 1994, 1995 Scott Bartram * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp * * @(#)sun_misc.c 8.1 (Berkeley) 6/18/93 */ #include __FBSDID("$FreeBSD$"); /* * IBCS2 compatibility module. * * IBCS2 system calls that are implemented differently in BSD are * handled here. */ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int ibcs2_ulimit(td, uap) struct thread *td; struct ibcs2_ulimit_args *uap; { struct rlimit rl; struct proc *p; int error; #define IBCS2_GETFSIZE 1 #define IBCS2_SETFSIZE 2 #define IBCS2_GETPSIZE 3 #define IBCS2_GETDTABLESIZE 4 p = td->td_proc; switch (uap->cmd) { case IBCS2_GETFSIZE: PROC_LOCK(p); td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE); PROC_UNLOCK(p); if (td->td_retval[0] == -1) td->td_retval[0] = 0x7fffffff; return 0; case IBCS2_SETFSIZE: PROC_LOCK(p); rl.rlim_max = lim_max(p, RLIMIT_FSIZE); PROC_UNLOCK(p); rl.rlim_cur = uap->newlimit; error = kern_setrlimit(td, RLIMIT_FSIZE, &rl); if (!error) { PROC_LOCK(p); td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE); PROC_UNLOCK(p); } else { DPRINTF(("failed ")); } return error; case IBCS2_GETPSIZE: PROC_LOCK(p); td->td_retval[0] = lim_cur(p, RLIMIT_RSS); /* XXX */ PROC_UNLOCK(p); return 0; case IBCS2_GETDTABLESIZE: uap->cmd = IBCS2_SC_OPEN_MAX; return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap); default: return ENOSYS; } } #define IBCS2_WSTOPPED 0177 #define IBCS2_STOPCODE(sig) ((sig) << 8 | IBCS2_WSTOPPED) int ibcs2_wait(td, uap) struct thread *td; struct ibcs2_wait_args *uap; { int error, options, status; int *statusp; pid_t pid; struct trapframe *tf = td->td_frame; if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V)) == (PSL_Z|PSL_PF|PSL_N|PSL_V)) { /* waitpid */ pid = uap->a1; statusp = (int *)uap->a2; options = uap->a3; } else { /* wait */ pid = WAIT_ANY; statusp = (int *)uap->a1; options = 0; } error = kern_wait(td, pid, &status, options, NULL); if (error) return error; if (statusp) { /* * Convert status/signal result. */ if (WIFSTOPPED(status)) { if (WSTOPSIG(status) <= 0 || WSTOPSIG(status) > IBCS2_SIGTBLSZ) return (EINVAL); status = IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]); } else if (WIFSIGNALED(status)) { if (WTERMSIG(status) <= 0 || WTERMSIG(status) > IBCS2_SIGTBLSZ) return (EINVAL); status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))]; } /* else exit status -- identical */ /* record result/status */ td->td_retval[1] = status; return copyout(&status, statusp, sizeof(status)); } return 0; } int ibcs2_execv(td, uap) struct thread *td; struct ibcs2_execv_args *uap; { struct image_args eargs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); return (error); } int ibcs2_execve(td, uap) struct thread *td; struct ibcs2_execve_args *uap; { struct image_args eargs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, uap->envp); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); return (error); } int ibcs2_umount(td, uap) struct thread *td; struct ibcs2_umount_args *uap; { struct unmount_args um; um.path = uap->name; um.flags = 0; return unmount(td, &um); } int ibcs2_mount(td, uap) struct thread *td; struct ibcs2_mount_args *uap; { #ifdef notyet int oflags = uap->flags, nflags, error; char fsname[MFSNAMELEN]; if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5)) return (EINVAL); if ((oflags & IBCS2_MS_NEWTYPE) == 0) return (EINVAL); nflags = 0; if (oflags & IBCS2_MS_RDONLY) nflags |= MNT_RDONLY; if (oflags & IBCS2_MS_NOSUID) nflags |= MNT_NOSUID; if (oflags & IBCS2_MS_REMOUNT) nflags |= MNT_UPDATE; uap->flags = nflags; if (error = copyinstr((caddr_t)uap->type, fsname, sizeof fsname, (u_int *)0)) return (error); if (strcmp(fsname, "4.2") == 0) { uap->type = (caddr_t)STACK_ALLOC(); if (error = copyout("ufs", uap->type, sizeof("ufs"))) return (error); } else if (strcmp(fsname, "nfs") == 0) { struct ibcs2_nfs_args sna; struct sockaddr_in sain; struct nfs_args na; struct sockaddr sa; if (error = copyin(uap->data, &sna, sizeof sna)) return (error); if (error = copyin(sna.addr, &sain, sizeof sain)) return (error); bcopy(&sain, &sa, sizeof sa); sa.sa_len = sizeof(sain); uap->data = (caddr_t)STACK_ALLOC(); na.addr = (struct sockaddr *)((int)uap->data + sizeof na); na.sotype = SOCK_DGRAM; na.proto = IPPROTO_UDP; na.fh = (nfsv2fh_t *)sna.fh; na.flags = sna.flags; na.wsize = sna.wsize; na.rsize = sna.rsize; na.timeo = sna.timeo; na.retrans = sna.retrans; na.hostname = sna.hostname; if (error = copyout(&sa, na.addr, sizeof sa)) return (error); if (error = copyout(&na, uap->data, sizeof na)) return (error); } return (mount(td, uap)); #else return EINVAL; #endif } /* * Read iBCS2-style directory entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. Like * SunOS, we squish out `empty' entries. * * This is quite ugly, but what do you expect from compatibility code? */ int ibcs2_getdents(td, uap) struct thread *td; register struct ibcs2_getdents_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag, vfslocked; u_long *cookies = NULL, *cookiep; int ncookies; #define BSD_DIRENT(cp) ((struct dirent *)(cp)) #define IBCS2_RECLEN(reclen) (reclen + sizeof(u_short)) if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { /* XXX vnode readdir op should do this */ VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, uap->nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) goto out; #endif /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) goto out; inp = buf; outp = uap->buf; resid = uap->nbytes; if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_getdents: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < IBCS2_RECLEN(reclen)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make an iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). */ idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno; idb.d_off = (ibcs2_off_t)off; idb.d_reclen = (u_short)IBCS2_RECLEN(reclen); if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 || (error = copyout(BSD_DIRENT(inp)->d_name, outp + 10, BSD_DIRENT(inp)->d_namlen + 1)) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += IBCS2_RECLEN(reclen); resid -= IBCS2_RECLEN(reclen); } /* if we squished out the whole block, try again */ if (outp == uap->buf) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = uap->nbytes - resid; out: VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); if (cookies) free(cookies, M_TEMP); free(buf, M_TEMP); return (error); } int ibcs2_read(td, uap) struct thread *td; struct ibcs2_read_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_direct { ibcs2_ino_t ino; char name[14]; } idb; off_t off; /* true file offset */ int buflen, error, eofflag, size, vfslocked; u_long *cookies = NULL, *cookiep; int ncookies; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) { if (error == EINVAL) return read(td, (struct read_args *)uap); else return error; } if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return read(td, (struct read_args *)uap); } off = fp->f_offset; DPRINTF(("ibcs2_read: read directory\n")); buflen = max(DIRBLKSIZ, uap->nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) goto out; #endif /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) { DPRINTF(("VOP_READDIR failed: %d\n", error)); goto out; } inp = buf; outp = uap->buf; resid = uap->nbytes; if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0 && resid > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_read: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < sizeof(struct ibcs2_direct)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make an iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). * * TODO: if length(filename) > 14, then break filename into * multiple entries and set inode = 0xffff except last */ idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 0xfffe : BSD_DIRENT(inp)->d_fileno; (void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size); bzero(idb.name + size, 14 - size); if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += sizeof(struct ibcs2_direct); resid -= sizeof(struct ibcs2_direct); } /* if we squished out the whole block, try again */ if (outp == uap->buf) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = uap->nbytes - resid; out: VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); if (cookies) free(cookies, M_TEMP); free(buf, M_TEMP); return (error); } int ibcs2_mknod(td, uap) struct thread *td; struct ibcs2_mknod_args *uap; { char *path; int error; CHECKALTCREAT(td, uap->path, &path); if (S_ISFIFO(uap->mode)) error = kern_mkfifo(td, path, UIO_SYSSPACE, uap->mode); else error = kern_mknod(td, path, UIO_SYSSPACE, uap->mode, uap->dev); free(path, M_TEMP); return (error); } int ibcs2_getgroups(td, uap) struct thread *td; struct ibcs2_getgroups_args *uap; { ibcs2_gid_t iset[NGROUPS_MAX]; gid_t gp[NGROUPS_MAX]; u_int i, ngrp; int error; if (uap->gidsetsize < 0) return (EINVAL); ngrp = MIN(uap->gidsetsize, NGROUPS_MAX); error = kern_getgroups(td, &ngrp, gp); if (error) return (error); if (uap->gidsetsize > 0) { for (i = 0; i < ngrp; i++) iset[i] = (ibcs2_gid_t)gp[i]; error = copyout(iset, uap->gidset, ngrp * sizeof(ibcs2_gid_t)); } if (error == 0) td->td_retval[0] = ngrp; return (error); } int ibcs2_setgroups(td, uap) struct thread *td; struct ibcs2_setgroups_args *uap; { ibcs2_gid_t iset[NGROUPS_MAX]; gid_t gp[NGROUPS_MAX]; int error, i; if (uap->gidsetsize < 0 || uap->gidsetsize > NGROUPS_MAX) return (EINVAL); if (uap->gidsetsize && uap->gidset) { error = copyin(uap->gidset, iset, sizeof(ibcs2_gid_t) * uap->gidsetsize); if (error) return (error); for (i = 0; i < uap->gidsetsize; i++) gp[i] = (gid_t)iset[i]; } return (kern_setgroups(td, uap->gidsetsize, gp)); } int ibcs2_setuid(td, uap) struct thread *td; struct ibcs2_setuid_args *uap; { struct setuid_args sa; sa.uid = (uid_t)uap->uid; return setuid(td, &sa); } int ibcs2_setgid(td, uap) struct thread *td; struct ibcs2_setgid_args *uap; { struct setgid_args sa; sa.gid = (gid_t)uap->gid; return setgid(td, &sa); } int ibcs2_time(td, uap) struct thread *td; struct ibcs2_time_args *uap; { struct timeval tv; microtime(&tv); td->td_retval[0] = tv.tv_sec; if (uap->tp) return copyout((caddr_t)&tv.tv_sec, (caddr_t)uap->tp, sizeof(ibcs2_time_t)); else return 0; } int ibcs2_pathconf(td, uap) struct thread *td; struct ibcs2_pathconf_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); uap->name++; /* iBCS2 _PC_* defines are offset by one */ error = kern_pathconf(td, path, UIO_SYSSPACE, uap->name); free(path, M_TEMP); return (error); } int ibcs2_fpathconf(td, uap) struct thread *td; struct ibcs2_fpathconf_args *uap; { uap->name++; /* iBCS2 _PC_* defines are offset by one */ return fpathconf(td, (struct fpathconf_args *)uap); } int ibcs2_sysconf(td, uap) struct thread *td; struct ibcs2_sysconf_args *uap; { int mib[2], value, len, error; struct proc *p; p = td->td_proc; switch(uap->name) { case IBCS2_SC_ARG_MAX: mib[1] = KERN_ARGMAX; break; case IBCS2_SC_CHILD_MAX: PROC_LOCK(p); td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NPROC); PROC_UNLOCK(p); return 0; case IBCS2_SC_CLK_TCK: td->td_retval[0] = hz; return 0; case IBCS2_SC_NGROUPS_MAX: mib[1] = KERN_NGROUPS; break; case IBCS2_SC_OPEN_MAX: PROC_LOCK(p); td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NOFILE); PROC_UNLOCK(p); return 0; case IBCS2_SC_JOB_CONTROL: mib[1] = KERN_JOB_CONTROL; break; case IBCS2_SC_SAVED_IDS: mib[1] = KERN_SAVED_IDS; break; case IBCS2_SC_VERSION: mib[1] = KERN_POSIX1; break; case IBCS2_SC_PASS_MAX: td->td_retval[0] = 128; /* XXX - should we create PASS_MAX ? */ return 0; case IBCS2_SC_XOPEN_VERSION: td->td_retval[0] = 2; /* XXX: What should that be? */ return 0; default: return EINVAL; } mib[0] = CTL_KERN; len = sizeof(value); error = kernel_sysctl(td, mib, 2, &value, &len, NULL, 0, NULL, 0); if (error) return error; td->td_retval[0] = value; return 0; } int ibcs2_alarm(td, uap) struct thread *td; struct ibcs2_alarm_args *uap; { struct itimerval itv, oitv; int error; timevalclear(&itv.it_interval); itv.it_value.tv_sec = uap->sec; itv.it_value.tv_usec = 0; error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv); if (error) return (error); if (oitv.it_value.tv_usec != 0) oitv.it_value.tv_sec++; td->td_retval[0] = oitv.it_value.tv_sec; return (0); } int ibcs2_times(td, uap) struct thread *td; struct ibcs2_times_args *uap; { struct rusage ru; struct timeval t; struct tms tms; int error; #define CONVTCK(r) (r.tv_sec * hz + r.tv_usec / (1000000 / hz)) error = kern_getrusage(td, RUSAGE_SELF, &ru); if (error) return (error); tms.tms_utime = CONVTCK(ru.ru_utime); tms.tms_stime = CONVTCK(ru.ru_stime); error = kern_getrusage(td, RUSAGE_CHILDREN, &ru); if (error) return (error); tms.tms_cutime = CONVTCK(ru.ru_utime); tms.tms_cstime = CONVTCK(ru.ru_stime); microtime(&t); td->td_retval[0] = CONVTCK(t); return (copyout(&tms, uap->tp, sizeof(struct tms))); } int ibcs2_stime(td, uap) struct thread *td; struct ibcs2_stime_args *uap; { struct timeval tv; long secs; int error; error = copyin(uap->timep, &secs, sizeof(long)); if (error) return (error); tv.tv_sec = secs; tv.tv_usec = 0; error = kern_settimeofday(td, &tv, NULL); if (error) error = EPERM; return (error); } int ibcs2_utime(td, uap) struct thread *td; struct ibcs2_utime_args *uap; { struct ibcs2_utimbuf ubuf; struct timeval tbuf[2], *tp; char *path; int error; if (uap->buf) { error = copyin(uap->buf, &ubuf, sizeof(ubuf)); if (error) return (error); tbuf[0].tv_sec = ubuf.actime; tbuf[0].tv_usec = 0; tbuf[1].tv_sec = ubuf.modtime; tbuf[1].tv_usec = 0; tp = tbuf; } else tp = NULL; CHECKALTEXIST(td, uap->path, &path); error = kern_utimes(td, path, UIO_SYSSPACE, tp, UIO_SYSSPACE); free(path, M_TEMP); return (error); } int ibcs2_nice(td, uap) struct thread *td; struct ibcs2_nice_args *uap; { int error; struct setpriority_args sa; sa.which = PRIO_PROCESS; sa.who = 0; sa.prio = td->td_proc->p_nice + uap->incr; if ((error = setpriority(td, &sa)) != 0) return EPERM; td->td_retval[0] = td->td_proc->p_nice; return 0; } /* * iBCS2 getpgrp, setpgrp, setsid, and setpgid */ int ibcs2_pgrpsys(td, uap) struct thread *td; struct ibcs2_pgrpsys_args *uap; { struct proc *p = td->td_proc; switch (uap->type) { case 0: /* getpgrp */ PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 1: /* setpgrp */ { struct setpgid_args sa; sa.pid = 0; sa.pgid = 0; setpgid(td, &sa); PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; } case 2: /* setpgid */ { struct setpgid_args sa; sa.pid = uap->pid; sa.pgid = uap->pgid; return setpgid(td, &sa); } case 3: /* setsid */ return setsid(td, NULL); default: return EINVAL; } } /* * XXX - need to check for nested calls */ int ibcs2_plock(td, uap) struct thread *td; struct ibcs2_plock_args *uap; { int error; #define IBCS2_UNLOCK 0 #define IBCS2_PROCLOCK 1 #define IBCS2_TEXTLOCK 2 #define IBCS2_DATALOCK 4 switch(uap->cmd) { case IBCS2_UNLOCK: error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* XXX - TODO */ return (0); case IBCS2_PROCLOCK: case IBCS2_TEXTLOCK: case IBCS2_DATALOCK: error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); /* XXX - TODO */ return 0; } return EINVAL; } int ibcs2_uadmin(td, uap) struct thread *td; struct ibcs2_uadmin_args *uap; { #define SCO_A_REBOOT 1 #define SCO_A_SHUTDOWN 2 #define SCO_A_REMOUNT 4 #define SCO_A_CLOCK 8 #define SCO_A_SETCONFIG 128 #define SCO_A_GETDEV 130 #define SCO_AD_HALT 0 #define SCO_AD_BOOT 1 #define SCO_AD_IBOOT 2 #define SCO_AD_PWRDOWN 3 #define SCO_AD_PWRNAP 4 #define SCO_AD_PANICBOOT 1 #define SCO_AD_GETBMAJ 0 #define SCO_AD_GETCMAJ 1 switch(uap->cmd) { case SCO_A_REBOOT: case SCO_A_SHUTDOWN: switch(uap->func) { struct reboot_args r; case SCO_AD_HALT: case SCO_AD_PWRDOWN: case SCO_AD_PWRNAP: r.opt = RB_HALT; return (reboot(td, &r)); case SCO_AD_BOOT: case SCO_AD_IBOOT: r.opt = RB_AUTOBOOT; return (reboot(td, &r)); } return EINVAL; case SCO_A_REMOUNT: case SCO_A_CLOCK: case SCO_A_SETCONFIG: return 0; case SCO_A_GETDEV: return EINVAL; /* XXX - TODO */ } return EINVAL; } int ibcs2_sysfs(td, uap) struct thread *td; struct ibcs2_sysfs_args *uap; { #define IBCS2_GETFSIND 1 #define IBCS2_GETFSTYP 2 #define IBCS2_GETNFSTYP 3 switch(uap->cmd) { case IBCS2_GETFSIND: case IBCS2_GETFSTYP: case IBCS2_GETNFSTYP: break; } return EINVAL; /* XXX - TODO */ } int ibcs2_unlink(td, uap) struct thread *td; struct ibcs2_unlink_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_unlink(td, path, UIO_SYSSPACE); free(path, M_TEMP); return (error); } int ibcs2_chdir(td, uap) struct thread *td; struct ibcs2_chdir_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_chdir(td, path, UIO_SYSSPACE); free(path, M_TEMP); return (error); } int ibcs2_chmod(td, uap) struct thread *td; struct ibcs2_chmod_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_chmod(td, path, UIO_SYSSPACE, uap->mode); free(path, M_TEMP); return (error); } int ibcs2_chown(td, uap) struct thread *td; struct ibcs2_chown_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_chown(td, path, UIO_SYSSPACE, uap->uid, uap->gid); free(path, M_TEMP); return (error); } int ibcs2_rmdir(td, uap) struct thread *td; struct ibcs2_rmdir_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_rmdir(td, path, UIO_SYSSPACE); free(path, M_TEMP); return (error); } int ibcs2_mkdir(td, uap) struct thread *td; struct ibcs2_mkdir_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_mkdir(td, path, UIO_SYSSPACE, uap->mode); free(path, M_TEMP); return (error); } int ibcs2_symlink(td, uap) struct thread *td; struct ibcs2_symlink_args *uap; { char *path, *link; int error; CHECKALTEXIST(td, uap->path, &path); /* * Have to expand CHECKALTCREAT() so that 'path' can be freed on * errors. */ error = ibcs2_emul_find(td, uap->link, UIO_USERSPACE, &link, 1); if (link == NULL) { free(path, M_TEMP); return (error); } error = kern_symlink(td, path, link, UIO_SYSSPACE); free(path, M_TEMP); free(link, M_TEMP); return (error); } int ibcs2_rename(td, uap) struct thread *td; struct ibcs2_rename_args *uap; { char *from, *to; int error; CHECKALTEXIST(td, uap->from, &from); /* * Have to expand CHECKALTCREAT() so that 'from' can be freed on * errors. */ error = ibcs2_emul_find(td, uap->to, UIO_USERSPACE, &to, 1); if (to == NULL) { free(from, M_TEMP); return (error); } error = kern_rename(td, from, to, UIO_SYSSPACE); free(from, M_TEMP); free(to, M_TEMP); return (error); } int ibcs2_readlink(td, uap) struct thread *td; struct ibcs2_readlink_args *uap; { char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_readlink(td, path, UIO_SYSSPACE, uap->buf, UIO_USERSPACE, uap->count); free(path, M_TEMP); return (error); } Index: head/sys/i386/ibcs2/imgact_coff.c =================================================================== --- head/sys/i386/ibcs2/imgact_coff.c (revision 175201) +++ head/sys/i386/ibcs2/imgact_coff.c (revision 175202) @@ -1,497 +1,497 @@ /*- * Copyright (c) 1994 Sean Eric Fagan * Copyright (c) 1994 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_DEPEND(coff, ibcs2, 1, 1, 1); extern struct sysentvec ibcs2_svr3_sysvec; static int coff_load_file(struct thread *td, char *name); static int exec_coff_imgact(struct image_params *imgp); static int load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot); static int load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) { size_t map_len; vm_offset_t map_offset; vm_offset_t map_addr; int error; unsigned char *data_buf = 0; size_t copy_len; map_offset = trunc_page(offset); map_addr = trunc_page((vm_offset_t)vmaddr); if (memsz > filsz) { /* * We have the stupid situation that * the section is longer than it is on file, * which means it has zero-filled areas, and * we have to work for it. Stupid iBCS! */ map_len = trunc_page(offset + filsz) - trunc_page(map_offset); } else { /* * The only stuff we care about is on disk, and we * don't care if we map in more than is really there. */ map_len = round_page(offset + filsz) - trunc_page(map_offset); } DPRINTF(("%s(%d): vm_mmap(&vmspace->vm_map, &0x%08lx, 0x%x, 0x%x, " "VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, 0x%x)\n", __FILE__, __LINE__, map_addr, map_len, prot, map_offset)); if ((error = vm_mmap(&vmspace->vm_map, &map_addr, map_len, prot, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, map_offset)) != 0) return error; if (memsz == filsz) { /* We're done! */ return 0; } /* * Now we have screwball stuff, to accomodate stupid COFF. * We have to map the remaining bit of the file into the kernel's * memory map, allocate some anonymous memory, copy that last * bit into it, and then we're done. *sigh* * For clean-up reasons, we actally map in the file last. */ copy_len = (offset + filsz) - trunc_page(offset + filsz); map_addr = trunc_page((vm_offset_t)vmaddr + filsz); map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx,0x%x, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, map_addr, map_len)); if (map_len != 0) { error = vm_map_find(&vmspace->vm_map, NULL, 0, &map_addr, map_len, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return error; } if ((error = vm_mmap(kernel_map, (vm_offset_t *) &data_buf, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, trunc_page(offset + filsz))) != 0) return error; error = copyout(data_buf, (caddr_t) map_addr, copy_len); if (vm_map_remove(kernel_map, (vm_offset_t) data_buf, (vm_offset_t) data_buf + PAGE_SIZE)) panic("load_coff_section vm_map_remove failed"); return error; } static int coff_load_file(struct thread *td, char *name) { struct proc *p = td->td_proc; struct vmspace *vmspace = p->p_vmspace; int error; struct nameidata nd; struct vnode *vp; struct vattr attr; struct filehdr *fhdr; struct aouthdr *ahdr; struct scnhdr *scns; char *ptr = 0; int nscns; unsigned long text_offset = 0, text_address = 0, text_size = 0; unsigned long data_offset = 0, data_address = 0, data_size = 0; unsigned long bss_size = 0; int i; NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, name, td); error = namei(&nd); if (error) return error; vp = nd.ni_vp; if (vp == NULL) return ENOEXEC; if (vp->v_writecount) { error = ETXTBSY; goto fail; } if ((error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) != 0) goto fail; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) goto fail; if (attr.va_size == 0) { error = ENOEXEC; goto fail; } if ((error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td)) != 0) goto fail; if ((error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL)) != 0) goto fail; /* * Lose the lock on the vnode. It's no longer needed, and must not * exist for the pagefault paging to work below. */ VOP_UNLOCK(vp, 0, td); if ((error = vm_mmap(kernel_map, (vm_offset_t *) &ptr, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0)) != 0) goto unlocked_fail; fhdr = (struct filehdr *)ptr; if (fhdr->f_magic != I386_COFF) { error = ENOEXEC; goto dealloc_and_fail; } nscns = fhdr->f_nscns; if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) { /* * XXX -- just fail. I'm so lazy. */ error = ENOEXEC; goto dealloc_and_fail; } ahdr = (struct aouthdr*)(ptr + sizeof(struct filehdr)); scns = (struct scnhdr*)(ptr + sizeof(struct filehdr) + sizeof(struct aouthdr)); for (i = 0; i < nscns; i++) { if (scns[i].s_flags & STYP_NOLOAD) continue; else if (scns[i].s_flags & STYP_TEXT) { text_address = scns[i].s_vaddr; text_size = scns[i].s_size; text_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_DATA) { data_address = scns[i].s_vaddr; data_size = scns[i].s_size; data_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_BSS) { bss_size = scns[i].s_size; } } if ((error = load_coff_section(vmspace, vp, text_offset, (caddr_t)(void *)(uintptr_t)text_address, text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE)) != 0) { goto dealloc_and_fail; } if ((error = load_coff_section(vmspace, vp, data_offset, (caddr_t)(void *)(uintptr_t)data_address, data_size + bss_size, data_size, VM_PROT_ALL)) != 0) { goto dealloc_and_fail; } error = 0; dealloc_and_fail: if (vm_map_remove(kernel_map, (vm_offset_t) ptr, (vm_offset_t) ptr + PAGE_SIZE)) panic("%s vm_map_remove failed", __func__); fail: VOP_UNLOCK(vp, 0, td); unlocked_fail: NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); return error; } static int exec_coff_imgact(imgp) struct image_params *imgp; { const struct filehdr *fhdr = (const struct filehdr*)imgp->image_header; const struct aouthdr *ahdr; const struct scnhdr *scns; int i; struct vmspace *vmspace; int nscns; int error; unsigned long text_offset = 0, text_address = 0, text_size = 0; unsigned long data_offset = 0, data_address = 0, data_size = 0; unsigned long bss_size = 0; caddr_t hole; struct thread *td = curthread; if (fhdr->f_magic != I386_COFF || !(fhdr->f_flags & F_EXEC)) { DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__)); return -1; } nscns = fhdr->f_nscns; if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) { /* * For now, return an error -- need to be able to * read in all of the section structures. */ DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__)); return -1; } ahdr = (const struct aouthdr*) ((const char*)(imgp->image_header) + sizeof(struct filehdr)); imgp->entry_addr = ahdr->entry; scns = (const struct scnhdr*) ((const char*)(imgp->image_header) + sizeof(struct filehdr) + sizeof(struct aouthdr)); VOP_UNLOCK(imgp->vp, 0, td); error = exec_new_vmspace(imgp, &ibcs2_svr3_sysvec); if (error) goto fail; vmspace = imgp->proc->p_vmspace; for (i = 0; i < nscns; i++) { DPRINTF(("i = %d, scns[i].s_name = %s, scns[i].s_vaddr = %08lx, " "scns[i].s_scnptr = %d\n", i, scns[i].s_name, scns[i].s_vaddr, scns[i].s_scnptr)); if (scns[i].s_flags & STYP_NOLOAD) { /* * A section that is not loaded, for whatever * reason. It takes precedance over other flag * bits... */ continue; } else if (scns[i].s_flags & STYP_TEXT) { text_address = scns[i].s_vaddr; text_size = scns[i].s_size; text_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_DATA) { /* .data section */ data_address = scns[i].s_vaddr; data_size = scns[i].s_size; data_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_BSS) { /* .bss section */ bss_size = scns[i].s_size; } else if (scns[i].s_flags & STYP_LIB) { char *buf = 0; int foff = trunc_page(scns[i].s_scnptr); int off = scns[i].s_scnptr - foff; int len = round_page(scns[i].s_size + PAGE_SIZE); int j; if ((error = vm_mmap(kernel_map, (vm_offset_t *) &buf, len, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, imgp->vp, foff)) != 0) { error = ENOEXEC; goto fail; } if(scns[i].s_size) { char *libbuf; int emul_path_len = strlen(ibcs2_emul_path); libbuf = malloc(MAXPATHLEN + emul_path_len, M_TEMP, M_WAITOK); strcpy(libbuf, ibcs2_emul_path); for (j = off; j < scns[i].s_size + off;) { long stroff, nextoff; char *libname; nextoff = 4 * *(long *)(buf + j); stroff = 4 * *(long *)(buf + j + sizeof(long)); libname = buf + j + stroff; j += nextoff; DPRINTF(("%s(%d): shared library %s\n", __FILE__, __LINE__, libname)); strlcpy(&libbuf[emul_path_len], libname, MAXPATHLEN); /* XXXKSE only 1:1 in coff */ error = coff_load_file( FIRST_THREAD_IN_PROC(imgp->proc), libbuf); if (error) error = coff_load_file( FIRST_THREAD_IN_PROC(imgp->proc), libname); if (error) break; } free(libbuf, M_TEMP); } if (vm_map_remove(kernel_map, (vm_offset_t) buf, (vm_offset_t) buf + len)) panic("exec_coff_imgact vm_map_remove failed"); if (error) goto fail; } } /* * Map in .text now */ DPRINTF(("%s(%d): load_coff_section(vmspace, " "imgp->vp, %08lx, %08lx, 0x%x, 0x%x, 0x%x)\n", __FILE__, __LINE__, text_offset, text_address, text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE)); if ((error = load_coff_section(vmspace, imgp->vp, text_offset, (caddr_t)(void *)(uintptr_t)text_address, text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE)) != 0) { DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error)); goto fail; } /* * Map in .data and .bss now */ DPRINTF(("%s(%d): load_coff_section(vmspace, " "imgp->vp, 0x%08lx, 0x%08lx, 0x%x, 0x%x, 0x%x)\n", __FILE__, __LINE__, data_offset, data_address, data_size + bss_size, data_size, VM_PROT_ALL)); if ((error = load_coff_section(vmspace, imgp->vp, data_offset, (caddr_t)(void *)(uintptr_t)data_address, data_size + bss_size, data_size, VM_PROT_ALL)) != 0) { DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error)); goto fail; } imgp->interpreted = 0; imgp->proc->p_sysent = &ibcs2_svr3_sysvec; vmspace->vm_tsize = round_page(text_size) >> PAGE_SHIFT; vmspace->vm_dsize = round_page(data_size + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)text_address; vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)data_address; hole = (caddr_t)trunc_page((vm_offset_t)vmspace->vm_daddr) + ctob(vmspace->vm_dsize); DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, hole)); DPRINTF(("imgact: error = %d\n", error)); error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *) &hole, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); DPRINTF(("IBCS2: start vm_dsize = 0x%x, vm_daddr = 0x%x end = 0x%x\n", ctob(vmspace->vm_dsize), vmspace->vm_daddr, ctob(vmspace->vm_dsize) + vmspace->vm_daddr )); DPRINTF(("%s(%d): returning successfully!\n", __FILE__, __LINE__)); fail: - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); return error; } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw coff_execsw = { exec_coff_imgact, "coff" }; EXEC_SET(coff, coff_execsw); Index: head/sys/i386/linux/imgact_linux.c =================================================================== --- head/sys/i386/linux/imgact_linux.c (revision 175201) +++ head/sys/i386/linux/imgact_linux.c (revision 175202) @@ -1,247 +1,247 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Based heavily on /sys/kern/imgact_aout.c which is: * Copyright (c) 1993, David Greenman * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_linux_imgact(struct image_params *iparams); static int exec_linux_imgact(struct image_params *imgp) { const struct exec *a_out = (const struct exec *) imgp->image_header; struct vmspace *vmspace; vm_offset_t vmaddr; unsigned long virtual_offset, file_offset; vm_offset_t buffer; unsigned long bss_size; struct thread *td = curthread; int error; if (((a_out->a_magic >> 16) & 0xff) != 0x64) return -1; /* * Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: virtual_offset = 0; file_offset = 1024; break; case 0314: virtual_offset = 4096; file_offset = 0; break; default: return (-1); } bss_size = round_page(a_out->a_bss); #ifdef DEBUG printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", (u_long)a_out->a_text, (u_long)a_out->a_data, bss_size); #endif /* * Check various fields in header for validity/bounds. */ if (a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); VOP_UNLOCK(imgp->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ error = exec_new_vmspace(imgp, &linux_sysvec); if (error) goto fail; vmspace = imgp->proc->p_vmspace; /* * Check if file_offset page aligned,. * Currently we cannot handle misaligned file offsets, * and so we read in the entire image (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("imgact: Non page aligned binary %lu\n", file_offset); #endif /* * Map text+data+bss read/write/execute */ vmaddr = virtual_offset; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data + bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; error = vm_mmap(kernel_map, &buffer, round_page(a_out->a_text + a_out->a_data + file_offset), VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, imgp->vp, trunc_page(file_offset)); if (error) goto fail; error = copyout((void *)(uintptr_t)(buffer + file_offset), (void *)vmaddr, a_out->a_text + a_out->a_data); vm_map_remove(kernel_map, buffer, buffer + round_page(a_out->a_text + a_out->a_data + file_offset)); if (error) goto fail; /* * remove write enable on the 'text' part */ error = vm_map_protect(&vmspace->vm_map, vmaddr, vmaddr + a_out->a_text, VM_PROT_EXECUTE|VM_PROT_READ, TRUE); if (error) goto fail; } else { #ifdef DEBUG printf("imgact: Page aligned binary %lu\n", file_offset); #endif /* * Map text+data read/execute */ vmaddr = virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, imgp->vp, file_offset); if (error) goto fail; #ifdef DEBUG printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr, (u_long)a_out->a_text + (u_long)a_out->a_data); #endif /* * allow read/write of data */ error = vm_map_protect(&vmspace->vm_map, vmaddr + a_out->a_text, vmaddr + a_out->a_text + a_out->a_data, VM_PROT_ALL, FALSE); if (error) goto fail; /* * Allocate anon demand-zeroed area for uninitialized data */ if (bss_size != 0) { vmaddr = virtual_offset + a_out->a_text + a_out->a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; #ifdef DEBUG printf("imgact: bssaddr=%08lx, length=%08lx\n", (u_long)vmaddr, bss_size); #endif } /* Indicate that this file should not be modified */ mp_fixme("Unlocked v_flag access"); imgp->vp->v_vflag |= VV_TEXT; } /* Fill in process VM information */ vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT; vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)virtual_offset; vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &linux_sysvec; fail: - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw linux_execsw = { exec_linux_imgact, "linux a.out" }; EXEC_SET(linuxaout, linux_execsw); Index: head/sys/kern/imgact_aout.c =================================================================== --- head/sys/kern/imgact_aout.c (revision 175201) +++ head/sys/kern/imgact_aout.c (revision 175202) @@ -1,274 +1,273 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_aout_imgact(struct image_params *imgp); static int aout_fixup(register_t **stack_base, struct image_params *imgp); struct sysentvec aout_sysvec = { SYS_MAXSYSCALL, sysent, 0, 0, NULL, 0, NULL, NULL, aout_fixup, sendsig, sigcode, &szsigcode, NULL, "FreeBSD a.out", NULL, NULL, MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs, NULL }; static int aout_fixup(stack_base, imgp) register_t **stack_base; struct image_params *imgp; { return (suword(--(*stack_base), imgp->args->argc)); } static int exec_aout_imgact(imgp) struct image_params *imgp; { const struct exec *a_out = (const struct exec *) imgp->image_header; - struct thread *td = curthread; struct vmspace *vmspace; vm_map_t map; vm_object_t object; vm_offset_t text_end, data_end; unsigned long virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. * NetBSD is in network byte order.. ugh. */ if (((a_out->a_magic >> 16) & 0xff) != 0x86 && ((a_out->a_magic >> 16) & 0xff) != 0 && ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) return -1; /* * Set file/virtual offset based on a.out variant. * We do two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_magic & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; /* Pass PS_STRINGS for BSD/OS binaries only. */ if (N_GETMID(*a_out) == MID_ZERO) imgp->ps_strings = aout_sysvec.sv_psstrings; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if (/* entry point must lay with text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (/* text can't exceed maximum text size */ a_out->a_text > maxtsiz || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. */ - VOP_UNLOCK(imgp->vp, 0, td); + VOP_UNLOCK(imgp->vp, 0, curthread); /* * Destroy old process VM and create a new one (with a new stack) */ error = exec_new_vmspace(imgp, &aout_sysvec); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); /* * The vm space can be changed by exec_new_vmspace */ vmspace = imgp->proc->p_vmspace; object = imgp->object; map = &vmspace->vm_map; vm_map_lock(map); vm_object_reference(object); text_end = virtual_offset + a_out->a_text; error = vm_map_insert(map, object, file_offset, virtual_offset, text_end, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference(object); error = vm_map_insert(map, object, file_offset + a_out->a_text, text_end, data_end, VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } } if (bss_size) { error = vm_map_insert(map, NULL, 0, data_end, data_end + bss_size, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_map_unlock(map); return (error); } } vm_map_unlock(map); /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; return (0); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; EXEC_SET(aout, aout_execsw); Index: head/sys/kern/imgact_elf.c =================================================================== --- head/sys/kern/imgact_elf.c (revision 175201) +++ head/sys/kern/imgact_elf.c (revision 175202) @@ -1,1356 +1,1356 @@ /*- * Copyright (c) 2000 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 #include #include #endif #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize); static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand", &__elfN(fallback_brand)); static int elf_trace = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, ""); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, ""); static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define trunc_page_ps(va, ps) ((va) & ~(ps - 1)) #define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1)) #define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a)) int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp) { Elf_Brandinfo *bi; int i; /* * We support three types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, and (3) path of the `interp_path' * field. We should also look for an ".note.ABI-tag" ELF section now * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones. */ /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0)) return (bi); } /* Lacking a known brand, search for a recognized interpreter. */ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && strcmp(interp, bi->interp_path) == 0) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_lock(map); vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Find the page from the underlying object. */ if (object) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } } return (KERN_SUCCESS); } static int __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv) return (rv); end = trunc_page(end); } if (end > start) { if (offset & PAGE_MASK) { /* * The mapping is not page aligned. This means we have * to copy the data. Sigh. */ rv = vm_map_find(map, NULL, 0, &start, end - start, FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0); if (rv) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } offset += sz; } rv = KERN_SUCCESS; } else { vm_object_reference(object); vm_map_lock(map); rv = vm_map_insert(map, object, offset, start, end, prot, VM_PROT_ALL, cow); vm_map_unlock(map); if (rv != KERN_SUCCESS) vm_object_deallocate(object); } return (rv); } else { return (KERN_SUCCESS); } } static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize) { struct sf_buf *sf; size_t map_len; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_offset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); file_addr = trunc_page_ps(offset, pagesize); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second.. */ if (memsz > filsz) map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; else map_len = round_page_ps(offset + filsz, pagesize) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(&vmspace->vm_map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) { return (0); } } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - map_addr; /* This had damn well better be true! */ if (map_len != 0) { rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (EINVAL); } } if (copy_len != 0) { vm_offset_t off; sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ off = trunc_page_ps(offset + filsz, pagesize) - trunc_page(offset + filsz); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error) { return (error); } } /* * set it to the specified protection. * XXX had better undo the damage from pasting over the cracks here! */ vm_map_protect(&vmspace->vm_map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vmspace *vmspace = p->p_vmspace; struct vattr *attr; struct image_params *imgp; vm_prot_t prot; u_long rbase; u_long base_addr = 0; int vfslocked, error, i, numsegs; if (curthread->td_proc != p) panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; imgp->firstpage = NULL; imgp->image_header = NULL; imgp->object = NULL; imgp->execlabel = NULL; /* XXXKSE */ NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); vfslocked = 0; if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } vfslocked = NDHASGIANT(nd); NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; /* * Also make certain that the interpreter stays the same, so set * its VV_TEXT flag, too. */ nd->ni_vp->v_vflag |= VV_TEXT; imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ /* (multiplication of two Elf_Half fields will not overflow) */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { error = ENOEXEC; goto fail; } for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ prot = 0; if (phdr[i].p_flags & PF_X) prot |= VM_PROT_EXECUTE; if (phdr[i].p_flags & PF_W) prot |= VM_PROT_WRITE; if (phdr[i].p_flags & PF_R) prot |= VM_PROT_READ; if ((error = __elfN(load_section)(vmspace, imgp->object, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize)) != 0) goto fail; /* * Establish the base address if this is the * first segment. */ if (numsegs == 0) base_addr = trunc_page(phdr[i].p_vaddr + rbase); numsegs++; } } *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) vput(nd->ni_vp); VFS_UNLOCK_GIANT(vfslocked); free(tempdata, M_TEMP); return (error); } static const char FREEBSD_ABI_VENDOR[] = "FreeBSD"; static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; const Elf_Phdr *phdr, *pnote = NULL; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_prot_t prot; u_long text_size = 0, data_size = 0, total_size = 0; u_long text_addr = 0, data_addr = 0; u_long seg_size, seg_addr; u_long addr, entry = 0, proghdr = 0; int error = 0, i; const char *interp = NULL, *newinterp = NULL; Elf_Brandinfo *brand_info; const Elf_Note *note, *note_end; char *path; const char *note_name; struct thread *td = curthread; struct sysentvec *sv; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { /* Only support headers in first page for now */ return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) return (ENOEXEC); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_INTERP) { /* Path to interpreter */ if (phdr[i].p_filesz > MAXPATHLEN || phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) return (ENOEXEC); interp = imgp->image_header + phdr[i].p_offset; break; } } brand_info = __elfN(get_brandinfo)(hdr, interp); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); return (ENOEXEC); } if (hdr->e_type == ET_DYN && (brand_info->flags & BI_CAN_EXEC_DYN) == 0) return (ENOEXEC); sv = brand_info->sysvec; if (interp != NULL && brand_info->interp_newpath != NULL) newinterp = brand_info->interp_newpath; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. */ VOP_UNLOCK(imgp->vp, 0, td); error = exec_new_vmspace(imgp, sv); imgp->proc->p_sysent = sv; - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); vmspace = imgp->proc->p_vmspace; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: /* Loadable segment */ prot = 0; if (phdr[i].p_flags & PF_X) prot |= VM_PROT_EXECUTE; if (phdr[i].p_flags & PF_W) prot |= VM_PROT_WRITE; if (phdr[i].p_flags & PF_R) prot |= VM_PROT_READ; #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER) /* * Some x86 binaries assume read == executable, * notably the M3 runtime and therefore cvsup */ if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif if ((error = __elfN(load_section)(vmspace, imgp->object, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr, phdr[i].p_memsz, phdr[i].p_filesz, prot, sv->sv_pagesize)) != 0) return (error); /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. */ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff; seg_addr = trunc_page(phdr[i].p_vaddr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr - seg_addr); /* * Is this .text or .data? We can't use * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the * alpha terribly and possibly does other bad * things so we stick to the old way of figuring * it out: If the segment contains the program * entry point, it's a text segment, otherwise it * is a data segment. * * Note that obreak() assumes that data_addr + * data_size == end of data load area, and the ELF * file format expects segments to be sorted by * address. If multiple data segments exist, the * last one will be used. */ if (hdr->e_entry >= phdr[i].p_vaddr && hdr->e_entry < (phdr[i].p_vaddr + phdr[i].p_memsz)) { text_size = seg_size; text_addr = seg_addr; entry = (u_long)hdr->e_entry; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr; break; case PT_NOTE: pnote = &phdr[i]; break; default: break; } } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. */ PROC_LOCK(imgp->proc); if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) || text_size > maxtsiz || total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. */ addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr + lim_max(imgp->proc, RLIMIT_DATA)); PROC_UNLOCK(imgp->proc); imgp->entry_addr = entry; if (interp != NULL) { int have_interp = FALSE; VOP_UNLOCK(imgp->vp, 0, td); if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, &addr, &imgp->entry_addr, sv->sv_pagesize); free(path, M_TEMP); if (error == 0) have_interp = TRUE; } if (!have_interp && newinterp != NULL) { error = __elfN(load_file)(imgp->proc, newinterp, &addr, &imgp->entry_addr, sv->sv_pagesize); have_interp = TRUE; } if (!have_interp) { error = __elfN(load_file)(imgp->proc, interp, &addr, &imgp->entry_addr, sv->sv_pagesize); } - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) { uprintf("ELF interpreter %s not found\n", interp); return (error); } } /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; elf_auxargs->trace = elf_trace; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; /* * Try to fetch the osreldate for FreeBSD binary from the ELF * OSABI-note. Only the first page of the image is searched, * the same as for headers. */ if (pnote != NULL && pnote->p_offset < PAGE_SIZE && pnote->p_offset + pnote->p_filesz < PAGE_SIZE ) { note = (const Elf_Note *)(imgp->image_header + pnote->p_offset); if (!aligned(note, Elf32_Addr)) { free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; return (ENOEXEC); } note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); while (note < note_end) { if (note->n_namesz == sizeof(FREEBSD_ABI_VENDOR) && note->n_descsz == sizeof(int32_t) && note->n_type == 1 /* ABI_NOTETYPE */) { note_name = (const char *)(note + 1); if (strncmp(FREEBSD_ABI_VENDOR, note_name, sizeof(FREEBSD_ABI_VENDOR)) == 0) { imgp->proc->p_osrel = *(const int32_t *) (note_name + round_page_ps(sizeof(FREEBSD_ABI_VENDOR), sizeof(Elf32_Addr))); break; } } note = (const Elf_Note *)((const char *)(note + 1) + round_page_ps(note->n_namesz, sizeof(Elf32_Addr)) + round_page_ps(note->n_descsz, sizeof(Elf32_Addr))); } } return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Addr *base; Elf_Addr *pos; base = (Elf_Addr *)*stack_base; pos = base + (imgp->args->argc + imgp->args->envc + 2); if (args->trace) { AUXARGS_ENTRY(pos, AT_DEBUG, 1); } if (args->execfd != -1) { AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); } AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. */ }; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static void each_writable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *, int, void *, size_t); static void __elfN(puthdr)(struct thread *, void *, size_t *, int); static void __elfN(putnote)(void *, size_t *, const char *, int, const void *, size_t); int __elfN(coredump)(td, vp, limit) struct thread *td; struct vnode *vp; off_t limit; { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; void *hdr; size_t hdrsize; /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_writable_segment(td, cb_size_segment, &seginfo); /* * Calculate the size of the core file header area by making * a dry run of generating it. Nothing is written, but the * size is calculated. */ hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); if (hdrsize + seginfo.size >= limit) return (EFAULT); /* * Allocate memory for building the header, fill it up, * and write it out. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); if (hdr == NULL) { return (EINVAL); } error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize); /* Write the contents of all of the writable segments. */ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = hdrsize; for (i = 0; i < seginfo.count; i++) { error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, curthread); /* XXXKSE */ if (error != 0) break; offset += php->p_filesz; php++; } } free(hdr, M_TEMP); return (error); } /* * A callback for each_writable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(entry, closure) vm_map_entry_t entry; void *closure; { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = 0; if (entry->protection & VM_PROT_READ) phdr->p_flags |= PF_R; if (entry->protection & VM_PROT_WRITE) phdr->p_flags |= PF_W; if (entry->protection & VM_PROT_EXECUTE) phdr->p_flags |= PF_X; phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_writable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(entry, closure) vm_map_entry_t entry; void *closure; { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_writable_segment(td, func, closure) struct thread *td; segment_callback func; void *closure; { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE; VM_OBJECT_UNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. */ static int __elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize) struct thread *td; struct vnode *vp; struct ucred *cred; int numsegs; size_t hdrsize; void *hdr; { size_t off; /* Fill in the header. */ bzero(hdr, hdrsize); off = 0; __elfN(puthdr)(td, hdr, &off, numsegs); /* Write it to the core file. */ return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td)); /* XXXKSE */ } #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; #endif static void __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs) { struct { elf_prstatus_t status; elf_prfpregset_t fpregset; elf_prpsinfo_t psinfo; } *tempdata; elf_prstatus_t *status; elf_prfpregset_t *fpregset; elf_prpsinfo_t *psinfo; struct proc *p; struct thread *thr; size_t ehoff, noteoff, notesz, phoff; p = td->td_proc; ehoff = *off; *off += sizeof(Elf_Ehdr); phoff = *off; *off += (numsegs + 1) * sizeof(Elf_Phdr); noteoff = *off; /* * Don't allocate space for the notes if we're just calculating * the size of the header. We also don't collect the data. */ if (dst != NULL) { tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK); status = &tempdata->status; fpregset = &tempdata->fpregset; psinfo = &tempdata->psinfo; } else { tempdata = NULL; status = NULL; fpregset = NULL; psinfo = NULL; } if (dst != NULL) { psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, td->td_name, sizeof(psinfo->pr_fname)); /* * XXX - We don't fill in the command line arguments properly * yet. */ strlcpy(psinfo->pr_psargs, td->td_name, sizeof(psinfo->pr_psargs)); } __elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, sizeof *psinfo); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { if (dst != NULL) { status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = p->p_sig; status->pr_pid = thr->td_tid; #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 fill_regs32(thr, &status->pr_reg); fill_fpregs32(thr, fpregset); #else fill_regs(thr, &status->pr_reg); fill_fpregs(thr, fpregset); #endif } __elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status, sizeof *status); __elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset, sizeof *fpregset); /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ __elfN(dump_thread)(thr, dst, off); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } notesz = *off - noteoff; if (dst != NULL) free(tempdata, M_TEMP); /* Align up to a page boundary for the program segments. */ *off = round_page(*off); if (dst != NULL) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; struct phdr_closure phc; /* * Fill in the ELF header. */ ehdr = (Elf_Ehdr *)((char *)dst + ehoff); ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 ehdr->e_machine = EM_386; #else ehdr->e_machine = ELF_ARCH; #endif ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = phoff; ehdr->e_flags = 0; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_phnum = numsegs + 1; ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shnum = 0; ehdr->e_shstrndx = SHN_UNDEF; /* * Fill in the program header entries. */ phdr = (Elf_Phdr *)((char *)dst + phoff); /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = noteoff; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 0; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = *off; each_writable_segment(td, cb_put_phdr, &phc); } } static void __elfN(putnote)(void *dst, size_t *off, const char *name, int type, const void *desc, size_t descsz) { Elf_Note note; note.n_namesz = strlen(name) + 1; note.n_descsz = descsz; note.n_type = type; if (dst != NULL) bcopy(¬e, (char *)dst + *off, sizeof note); *off += sizeof note; if (dst != NULL) bcopy(name, (char *)dst + *off, note.n_namesz); *off += roundup2(note.n_namesz, sizeof(Elf_Size)); if (dst != NULL) bcopy(desc, (char *)dst + *off, note.n_descsz); *off += roundup2(note.n_descsz, sizeof(Elf_Size)); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); Index: head/sys/kern/imgact_gzip.c =================================================================== --- head/sys/kern/imgact_gzip.c (revision 175201) +++ head/sys/kern/imgact_gzip.c (revision 175202) @@ -1,403 +1,403 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- */ /* * This module handles execution of a.out files which have been run through * "gzip". This saves diskspace, but wastes cpu-cycles and VM. * * TODO: * text-segments should be made R/O after being filled * is the vm-stuff safe ? * should handle the entire header of gzip'ed stuff. * inflate isn't quite reentrant yet... * error-handling is a mess... * so is the rest... * tidy up unnecesary includes */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct imgact_gzip { struct image_params *ip; struct exec a_out; int error; int gotheader; int where; u_char *inbuf; u_long offset; u_long output; u_long len; int idx; u_long virtual_offset, file_offset, file_end, bss_size; }; static int exec_gzip_imgact(struct image_params *imgp); static int NextByte(void *vp); static int do_aout_hdr(struct imgact_gzip *); static int Flush(void *vp, u_char *, u_long siz); static int exec_gzip_imgact(imgp) struct image_params *imgp; { int error, error2 = 0; const u_char *p = (const u_char *) imgp->image_header; struct imgact_gzip igz; struct inflate infl; struct vmspace *vmspace; /* If these four are not OK, it isn't a gzip file */ if (p[0] != 0x1f) return -1; /* 0 Simply magic */ if (p[1] != 0x8b) return -1; /* 1 Simply magic */ if (p[2] != 0x08) return -1; /* 2 Compression method */ if (p[9] != 0x03) return -1; /* 9 OS compressed on */ /* * If this one contains anything but a comment or a filename marker, * we don't want to chew on it */ if (p[3] & ~(0x18)) return ENOEXEC; /* 3 Flags */ /* These are of no use to us */ /* 4-7 Timestamp */ /* 8 Extra flags */ bzero(&igz, sizeof igz); bzero(&infl, sizeof infl); infl.gz_private = (void *) &igz; infl.gz_input = NextByte; infl.gz_output = Flush; igz.ip = imgp; igz.idx = 10; if (p[3] & 0x08) { /* skip a filename */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } if (p[3] & 0x10) { /* skip a comment */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } igz.len = imgp->attr->va_size; error = inflate(&infl); /* * The unzipped file may not even have been long enough to contain * a header giving Flush() a chance to return error. Check for this. */ if ( !igz.gotheader ) return ENOEXEC; if ( !error ) { vmspace = imgp->proc->p_vmspace; error = vm_map_protect(&vmspace->vm_map, (vm_offset_t) vmspace->vm_taddr, (vm_offset_t) (vmspace->vm_taddr + (vmspace->vm_tsize << PAGE_SHIFT)) , VM_PROT_READ|VM_PROT_EXECUTE,0); } if (igz.inbuf) { error2 = vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, (vm_offset_t) igz.inbuf + PAGE_SIZE); } if (igz.error || error || error2) { printf("Output=%lu ", igz.output); printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", error, igz.error, error2, igz.where); } if (igz.error) return igz.error; if (error) return ENOEXEC; if (error2) return error2; return 0; } static int do_aout_hdr(struct imgact_gzip * gz) { int error; struct thread *td = curthread; struct vmspace *vmspace; vm_offset_t vmaddr; /* * Set file/virtual offset based on a.out variant. We do two cases: * host byte order and network byte order (for NetBSD compatibility) */ switch ((int) (gz->a_out.a_magic & 0xffff)) { case ZMAGIC: gz->virtual_offset = 0; if (gz->a_out.a_text) { gz->file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ gz->file_offset = 0; } break; case QMAGIC: gz->virtual_offset = PAGE_SIZE; gz->file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: gz->virtual_offset = PAGE_SIZE; gz->file_offset = 0; break; default: gz->where = __LINE__; return (-1); } } gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if ( /* entry point must lay with text region */ gz->a_out.a_entry < gz->virtual_offset || gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || /* text and data size must each be page rounded */ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { gz->where = __LINE__; return (-1); } /* * text/data/bss must not exceed limits */ PROC_LOCK(gz->ip->proc); if ( /* text can't exceed maximum text size */ gz->a_out.a_text > maxtsiz || /* data + bss can't exceed rlimit */ gz->a_out.a_data + gz->bss_size > lim_cur(gz->ip->proc, RLIMIT_DATA)) { PROC_UNLOCK(gz->ip->proc); gz->where = __LINE__; return (ENOMEM); } PROC_UNLOCK(gz->ip->proc); /* Find out how far we should go */ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. */ VOP_UNLOCK(gz->ip->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ error = exec_new_vmspace(gz->ip, &aout_sysvec); - vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY); if (error) { gz->where = __LINE__; return (error); } vmspace = gz->ip->proc->p_vmspace; vmaddr = gz->virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, gz->a_out.a_text + gz->a_out.a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, OBJT_DEFAULT, NULL, 0); if (error) { gz->where = __LINE__; return (error); } if (gz->bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data. * "bss" = 'block started by symbol' - named after the * IBM 7090 instruction of the same name. */ vmaddr = gz->virtual_offset + gz->a_out.a_text + gz->a_out.a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { gz->where = __LINE__; return (error); } } /* Fill in process VM information */ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (gz->virtual_offset + gz->a_out.a_text); /* Fill in image_params */ gz->ip->interpreted = 0; gz->ip->entry_addr = gz->a_out.a_entry; gz->ip->proc->p_sysent = &aout_sysvec; return 0; } static int NextByte(void *vp) { int error; struct imgact_gzip *igz = (struct imgact_gzip *) vp; if (igz->idx >= igz->len) { igz->where = __LINE__; return GZ_EOF; } if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { return igz->inbuf[(igz->idx++) - igz->offset]; } if (igz->inbuf) { error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, (vm_offset_t) igz->inbuf + PAGE_SIZE); if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } } igz->offset = igz->idx & ~PAGE_MASK; error = vm_mmap(kernel_map, /* map */ (vm_offset_t *) & igz->inbuf, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ OBJT_VNODE, /* handle type */ igz->ip->vp, /* vnode */ igz->offset); /* offset */ if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } return igz->inbuf[(igz->idx++) - igz->offset]; } static int Flush(void *vp, u_char * ptr, u_long siz) { struct imgact_gzip *gz = (struct imgact_gzip *) vp; u_char *p = ptr, *q; int i; /* First, find an a.out-header. */ if (gz->output < sizeof gz->a_out) { q = (u_char *) & gz->a_out; i = min(siz, sizeof gz->a_out - gz->output); bcopy(p, q + gz->output, i); gz->output += i; p += i; siz -= i; if (gz->output == sizeof gz->a_out) { gz->gotheader = 1; i = do_aout_hdr(gz); if (i == -1) { if (!gz->where) gz->where = __LINE__; gz->error = ENOEXEC; return ENOEXEC; } else if (i) { gz->where = __LINE__; gz->error = i; return ENOEXEC; } if (gz->file_offset == 0) { q = (u_char *) (uintptr_t) gz->virtual_offset; copyout(&gz->a_out, q, sizeof gz->a_out); } } } /* Skip over zero-padded first PAGE if needed */ if (gz->output < gz->file_offset && gz->output + siz > gz->file_offset) { i = min(siz, gz->file_offset - gz->output); gz->output += i; p += i; siz -= i; } if (gz->output >= gz->file_offset && gz->output < gz->file_end) { i = min(siz, gz->file_end - gz->output); q = (u_char *) (uintptr_t) (gz->virtual_offset + gz->output - gz->file_offset); copyout(p, q, i); gz->output += i; p += i; siz -= i; } gz->output += siz; return 0; } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; EXEC_SET(execgzip, gzip_execsw); Index: head/sys/kern/kern_alq.c =================================================================== --- head/sys/kern/kern_alq.c (revision 175201) +++ head/sys/kern/kern_alq.c (revision 175202) @@ -1,519 +1,519 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Async. Logging Queue */ struct alq { int aq_entmax; /* Max entries */ int aq_entlen; /* Entry length */ char *aq_entbuf; /* Buffer for stored entries */ int aq_flags; /* Queue flags */ struct mtx aq_mtx; /* Queue lock */ struct vnode *aq_vp; /* Open vnode handle */ struct ucred *aq_cred; /* Credentials of the opening thread */ struct ale *aq_first; /* First ent */ struct ale *aq_entfree; /* First free ent */ struct ale *aq_entvalid; /* First ent valid for writing */ LIST_ENTRY(alq) aq_act; /* List of active queues */ LIST_ENTRY(alq) aq_link; /* List of all queues */ }; #define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */ #define AQ_ACTIVE 0x0002 /* on the active list */ #define AQ_FLUSHING 0x0004 /* doing IO */ #define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */ #define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx) #define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx) static MALLOC_DEFINE(M_ALD, "ALD", "ALD"); /* * The ald_mtx protects the ald_queues list and the ald_active list. */ static struct mtx ald_mtx; static LIST_HEAD(, alq) ald_queues; static LIST_HEAD(, alq) ald_active; static int ald_shutingdown = 0; struct thread *ald_thread; static struct proc *ald_proc; #define ALD_LOCK() mtx_lock(&ald_mtx) #define ALD_UNLOCK() mtx_unlock(&ald_mtx) /* Daemon functions */ static int ald_add(struct alq *); static int ald_rem(struct alq *); static void ald_startup(void *); static void ald_daemon(void); static void ald_shutdown(void *, int); static void ald_activate(struct alq *); static void ald_deactivate(struct alq *); /* Internal queue functions */ static void alq_shutdown(struct alq *); static int alq_doio(struct alq *); /* * Add a new queue to the global list. Fail if we're shutting down. */ static int ald_add(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_INSERT_HEAD(&ald_queues, alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Remove a queue from the global list unless we're shutting down. If so, * the ald will take care of cleaning up it's resources. */ static int ald_rem(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_REMOVE(alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Put a queue on the active list. This will schedule it for writing. */ static void ald_activate(struct alq *alq) { LIST_INSERT_HEAD(&ald_active, alq, aq_act); wakeup(&ald_active); } static void ald_deactivate(struct alq *alq) { LIST_REMOVE(alq, aq_act); alq->aq_flags &= ~AQ_ACTIVE; } static void ald_startup(void *unused) { mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET); LIST_INIT(&ald_queues); LIST_INIT(&ald_active); } static void ald_daemon(void) { int needwakeup; struct alq *alq; ald_thread = FIRST_THREAD_IN_PROC(ald_proc); EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL, SHUTDOWN_PRI_FIRST); ALD_LOCK(); for (;;) { while ((alq = LIST_FIRST(&ald_active)) == NULL) msleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0); ALQ_LOCK(alq); ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); ALQ_UNLOCK(alq); if (needwakeup) wakeup(alq); ALD_LOCK(); } } static void ald_shutdown(void *arg, int howto) { struct alq *alq; ALD_LOCK(); ald_shutingdown = 1; while ((alq = LIST_FIRST(&ald_queues)) != NULL) { LIST_REMOVE(alq, aq_link); ALD_UNLOCK(); alq_shutdown(alq); ALD_LOCK(); } ALD_UNLOCK(); } static void alq_shutdown(struct alq *alq) { ALQ_LOCK(alq); /* Stop any new writers. */ alq->aq_flags |= AQ_SHUTDOWN; /* Drain IO */ while (alq->aq_flags & (AQ_FLUSHING|AQ_ACTIVE)) { alq->aq_flags |= AQ_WANTED; ALQ_UNLOCK(alq); tsleep(alq, PWAIT, "aldclose", 0); ALQ_LOCK(alq); } ALQ_UNLOCK(alq); vn_close(alq->aq_vp, FWRITE, alq->aq_cred, curthread); crfree(alq->aq_cred); } /* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; struct ale *ale; struct ale *alstart; int totlen; int iov; int vfslocked; vp = alq->aq_vp; td = curthread; totlen = 0; iov = 0; alstart = ale = alq->aq_entvalid; alq->aq_entvalid = NULL; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); do { if (aiov[iov].iov_base == NULL) aiov[iov].iov_base = ale->ae_data; aiov[iov].iov_len += alq->aq_entlen; totlen += alq->aq_entlen; /* Check to see if we're wrapping the buffer */ if (ale->ae_data + alq->aq_entlen != ale->ae_next->ae_data) iov++; ale->ae_flags &= ~AE_VALID; ale = ale->ae_next; } while (ale->ae_flags & AE_VALID); alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); if (iov == 2 || aiov[iov].iov_base == NULL) iov--; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov + 1; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_LEASE(vp, td, alq->aq_cred, LEASE_WRITE); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; if (alq->aq_entfree == NULL) alq->aq_entfree = alstart; if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); } static struct kproc_desc ald_kp = { "ALQ Daemon", ald_daemon, &ald_proc }; SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp) SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL) /* User visible queue functions */ /* * Create the queue data structure, allocate the buffer, and open the file. */ int alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int count) { struct thread *td; struct nameidata nd; struct ale *ale; struct ale *alp; struct alq *alq; char *bufp; int flags; int error; int i, vfslocked; *alqp = NULL; td = curthread; NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, file, td); flags = FWRITE | O_NOFOLLOW | O_CREAT; error = vn_open_cred(&nd, &flags, cmode, cred, NULL); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* We just unlock so we hold a reference */ VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO); alq->aq_entbuf = malloc(count * size, M_ALD, M_WAITOK|M_ZERO); alq->aq_first = malloc(sizeof(*ale) * count, M_ALD, M_WAITOK|M_ZERO); alq->aq_vp = nd.ni_vp; alq->aq_cred = crhold(cred); alq->aq_entmax = count; alq->aq_entlen = size; alq->aq_entfree = alq->aq_first; mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET); bufp = alq->aq_entbuf; ale = alq->aq_first; alp = NULL; /* Match up entries with buffers */ for (i = 0; i < count; i++) { if (alp) alp->ae_next = ale; ale->ae_data = bufp; alp = ale; ale++; bufp += size; } alp->ae_next = alq->aq_first; if ((error = ald_add(alq)) != 0) return (error); *alqp = alq; return (0); } /* * Copy a new entry into the queue. If the operation would block either * wait or return an error depending on the value of waitok. */ int alq_write(struct alq *alq, void *data, int waitok) { struct ale *ale; if ((ale = alq_get(alq, waitok)) == NULL) return (EWOULDBLOCK); bcopy(data, ale->ae_data, alq->aq_entlen); alq_post(alq, ale); return (0); } struct ale * alq_get(struct alq *alq, int waitok) { struct ale *ale; struct ale *aln; ale = NULL; ALQ_LOCK(alq); /* Loop until we get an entry or we're shutting down */ while ((alq->aq_flags & AQ_SHUTDOWN) == 0 && (ale = alq->aq_entfree) == NULL && (waitok & ALQ_WAITOK)) { alq->aq_flags |= AQ_WANTED; ALQ_UNLOCK(alq); tsleep(alq, PWAIT, "alqget", 0); ALQ_LOCK(alq); } if (ale != NULL) { aln = ale->ae_next; if ((aln->ae_flags & AE_VALID) == 0) alq->aq_entfree = aln; else alq->aq_entfree = NULL; } else ALQ_UNLOCK(alq); return (ale); } void alq_post(struct alq *alq, struct ale *ale) { int activate; ale->ae_flags |= AE_VALID; if (alq->aq_entvalid == NULL) alq->aq_entvalid = ale; if ((alq->aq_flags & AQ_ACTIVE) == 0) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } else activate = 0; ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } } void alq_flush(struct alq *alq) { int needwakeup = 0; ALD_LOCK(); ALQ_LOCK(alq); if (alq->aq_flags & AQ_ACTIVE) { ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); } else ALD_UNLOCK(); ALQ_UNLOCK(alq); if (needwakeup) wakeup(alq); } /* * Flush remaining data, close the file and free all resources. */ void alq_close(struct alq *alq) { /* * If we're already shuting down someone else will flush and close * the vnode. */ if (ald_rem(alq) != 0) return; /* * Drain all pending IO. */ alq_shutdown(alq); mtx_destroy(&alq->aq_mtx); free(alq->aq_first, M_ALD); free(alq->aq_entbuf, M_ALD); free(alq, M_ALD); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 175201) +++ head/sys/kern/kern_descrip.c (revision 175202) @@ -1,2868 +1,2868 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static uma_zone_t file_zone; /* How to treat 'new' parameter when allocating a fd for do_dup(). */ enum dup_type { DUP_VARIABLE, DUP_FIXED }; static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval); static int fd_first_free(struct filedesc *, int, int); static int fd_last_used(struct filedesc *, int, int); static void fdgrowtable(struct filedesc *, int); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); /* * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ volatile int openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; /* * Find the first zero bit in the given bitmap, starting at low and not * exceeding size - 1. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at low and * not exceeding size - 1. */ static int fd_last_used(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; if (low >= size) return (-1); off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(low); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (low - 1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(!fdisused(fdp, fd), ("fd already used")); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd is already unused")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd is still in use")); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, 0, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, td->td_retval)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; intptr_t arg; int error; error = 0; switch (uap->cmd) { case F_GETLK: case F_SETLK: case F_SETLKW: error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); arg = (intptr_t)&fl; break; default: arg = uap->arg; break; } if (error) return (error); error = kern_fcntl(td, uap->fd, uap->cmd, arg); if (error) return (error); if (uap->cmd == F_GETLK) error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); return (error); } static inline struct file * fdtofp(int fd, struct filedesc *fdp) { struct file *fp; FILEDESC_LOCK_ASSERT(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (NULL); return (fp); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp; struct proc *p; char *pop; struct vnode *vp; u_int newmin; int error, flg, tmp; int vfslocked; vfslocked = 0; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; switch (cmd) { case F_DUPFD: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } FILEDESC_SUNLOCK(fdp); newmin = arg; PROC_LOCK(p); if (newmin >= lim_cur(p, RLIMIT_NOFILE) || newmin >= maxfilesperproc) { PROC_UNLOCK(p); error = EINVAL; break; } PROC_UNLOCK(p); error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval); break; case F_GETFD: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } pop = &fdp->fd_ofileflags[fd]; td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_SUNLOCK(fdp); break; case F_SETFD: FILEDESC_XLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_XUNLOCK(fdp); error = EBADF; break; } pop = &fdp->fd_ofileflags[fd]; *pop = (*pop &~ UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); FILEDESC_XUNLOCK(fdp); break; case F_GETFL: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } td->td_retval[0] = OFLAGS(fp->f_flag); FILEDESC_SUNLOCK(fdp); break; case F_SETFL: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fhold(fp); FILEDESC_SUNLOCK(fdp); do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fhold(fp); FILEDESC_SUNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fhold(fp); FILEDESC_SUNLOCK(fdp); tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } if (fp->f_type != DTYPE_VNODE) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start)) { FILEDESC_SUNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_SUNLOCK(fdp); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); break; default: error = EINVAL; break; } VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* Check for race with close */ FILEDESC_SLOCK(fdp); if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_ofiles[fd]) { FILEDESC_SUNLOCK(fdp); flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; vfslocked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; } else FILEDESC_SUNLOCK(fdp); fdrop(fp, td); break; case F_GETLK: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } if (fp->f_type != DTYPE_VNODE) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { FILEDESC_SUNLOCK(fdp); error = EINVAL; break; } if (flp->l_whence == SEEK_CUR) { if ((flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && fp->f_offset < OFF_MIN - flp->l_start)) { FILEDESC_SUNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_SUNLOCK(fdp); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; fdrop(fp, td); break; default: error = EINVAL; break; } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval) { struct filedesc *fdp; struct proc *p; struct file *fp; struct file *delfp; int error, holdleaders, maxfd; KASSERT((type == DUP_VARIABLE || type == DUP_FIXED), ("invalid dup type %d", type)); p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. */ if (old < 0 || new < 0) return (EBADF); PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if (new >= maxfd) return (EMFILE); FILEDESC_XLOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } if (type == DUP_FIXED && old == new) { *retval = new; FILEDESC_XUNLOCK(fdp); return (0); } fp = fdp->fd_ofiles[old]; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. Since the filedesc * lock may be temporarily dropped in the process, we have to look * out for a race. */ if (type == DUP_FIXED) { if (new >= fdp->fd_nfiles) fdgrowtable(fdp, new + 1); if (fdp->fd_ofiles[new] == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (error); } } /* * If the old file changed out from under us then treat it as a * bad file descriptor. Userland should do its own locking to * avoid this case. */ if (fdp->fd_ofiles[old] != fp) { /* we've allocated a descriptor which we won't use */ if (fdp->fd_ofiles[new] == NULL) fdunused(fdp, new); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (EBADF); } KASSERT(old != new, ("new fd is same as old")); /* * Save info on the descriptor being overwritten. We cannot close * it without introducing an ownership race for the slot, since we * need to drop the filedesc lock to call closef(). * * XXX this duplicates parts of close(). */ delfp = fdp->fd_ofiles[new]; holdleaders = 0; if (delfp != NULL) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } } /* * Duplicate the source descriptor */ fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). * * XXX this duplicates parts of close(). */ if (delfp != NULL) { knote_fdclose(td, new); if (delfp->f_type == DTYPE_MQUEUE) mq_fdclose(td, new, delfp); FILEDESC_XUNLOCK(fdp); (void) closef(delfp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } } else { FILEDESC_XUNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { return (kern_close(td, uap->fd)); } int kern_close(td, fd) struct thread *td; int fd; { struct filedesc *fdp; struct file *fp; int error; int holdleaders; error = 0; holdleaders = 0; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; fdunused(fdp, fd); if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG(fd, fd); if ((error = fget(td, fd, &fp)) != 0) return (error); AUDIT_ARG(file, td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors. */ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Grow the file table to accomodate (at least) nfd descriptors. This may * block and drop the filedesc lock, but it will reacquire it before * returning. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct file **ntable; char *nfileflags; int nnfiles, onfiles; NDSLOTTYPE *nmap; FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* compute the size of the new table */ onfiles = fdp->fd_nfiles; nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* allocate a new table and (if required) new bitmaps */ FILEDESC_XUNLOCK(fdp); MALLOC(ntable, struct file **, nnfiles * OFILESIZE, M_FILEDESC, M_ZERO | M_WAITOK); nfileflags = (char *)&ntable[nnfiles]; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); else nmap = NULL; FILEDESC_XLOCK(fdp); /* * We now have new tables ready to go. Since we dropped the * filedesc lock to call malloc(), watch out for a race. */ onfiles = fdp->fd_nfiles; if (onfiles >= nnfiles) { /* we lost the race, but that's OK */ free(ntable, M_FILEDESC); if (nmap != NULL) free(nmap, M_FILEDESC); return; } bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); bcopy(fdp->fd_ofileflags, nfileflags, onfiles); if (onfiles > NDFILE) free(fdp->fd_ofiles, M_FILEDESC); fdp->fd_ofiles = ntable; fdp->fd_ofileflags = nfileflags; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); fdp->fd_map = nmap; } fdp->fd_nfiles = nnfiles; } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd = -1, maxfd; FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); /* * Search the bitmap for a free descriptor. If none is found, try * to grow the file table. Keep at it until we either get a file * descriptor or run into process or system limits; fdgrowtable() * may drop the filedesc lock, so we're in a race. */ for (;;) { fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd < fdp->fd_nfiles) break; fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("free descriptor isn't")); fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ fdused(fdp, fd); *result = fd; return (0); } /* * Check to see whether n user file descriptors are available to the process * p. */ int fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp); PROC_LOCK(p); lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate a file decriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { struct proc *p = td->td_proc; struct file *fp; int error, i; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", td->td_ucred->cr_ruid); } uma_zfree(file_zone, fp); return (ENFILE); } atomic_add_int(&openfiles, 1); /* * If the process has file descriptor zero open, add the new file * descriptor to the list of open files at that point, otherwise * put it at the front of the list of open files. */ fp->f_count = 1; if (resultfp) fp->f_count++; fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; FILEDESC_XLOCK(p->p_fd); if ((error = fdalloc(td, 0, &i))) { FILEDESC_XUNLOCK(p->p_fd); fdrop(fp, td); if (resultfp) fdrop(fp, td); return (error); } p->p_fd->fd_ofiles[i] = fp; FILEDESC_XUNLOCK(p->p_fd); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. */ struct filedesc * fdinit(struct filedesc *fdp) { struct filedesc0 *newfdp; newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); FILEDESC_LOCK_INIT(&newfdp->fd_fd); if (fdp != NULL) { FILEDESC_XLOCK(fdp); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); FILEDESC_XUNLOCK(fdp); } /* Create the file descriptor table. */ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_map = newfdp->fd_dmap; newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; mtx_lock(&fdesc_mtx); fdp = p->p_fd; if (fdp != NULL) fdp->fd_holdcnt++; mtx_unlock(&fdesc_mtx); return (fdp); } static void fddrop(struct filedesc *fdp) { int i; mtx_lock(&fdesc_mtx); i = --fdp->fd_holdcnt; mtx_unlock(&fdesc_mtx); if (i > 0) return; FILEDESC_LOCK_DESTROY(fdp); FREE(fdp, M_FILEDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { FILEDESC_XLOCK(fdp); fdp->fd_refcnt++; FILEDESC_XUNLOCK(fdp); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct proc *p, struct thread *td) { FILEDESC_XLOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; FILEDESC_XUNLOCK(p->p_fd); tmp = fdcopy(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_XUNLOCK(p->p_fd); } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); newfdp = fdinit(fdp); FILEDESC_SLOCK(fdp); while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); } /* copy everything except kqueue descriptors */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fhold(newfdp->fd_ofiles[i]); newfdp->fd_lastfile = i; } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; } } FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) if (newfdp->fd_ofiles[i] != NULL) fdused(newfdp, i); FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Release a filedesc structure. */ void fdfree(struct thread *td) { struct filedesc *fdp; struct file **fpp; int i, locked; struct filedesc_to_leader *fdtol; struct file *fp; struct vnode *cdir, *jdir, *rdir, *vp; struct flock lf; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0, fpp = fdp->fd_ofiles; i <= fdp->fd_lastfile; i++, fpp++) { if (*fpp == NULL || (*fpp)->f_type != DTYPE_VNODE) continue; fp = *fpp; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; locked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc-> p_leader, F_UNLCK, &lf, F_POSIX); VFS_UNLOCK_GIANT(locked); FILEDESC_XLOCK(fdp); fdrop(fp, td); fpp = fdp->fd_ofiles + i; } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; td->td_proc->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) FREE(fdtol, M_FILEDESC_TO_LEADER); } FILEDESC_XLOCK(fdp); i = --fdp->fd_refcnt; FILEDESC_XUNLOCK(fdp); if (i > 0) return; /* * We are the last reference to the structure, so we can * safely assume it will not change out from under us. */ fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } FILEDESC_XLOCK(fdp); /* XXX This should happen earlier. */ mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) FREE(fdp->fd_map, M_FILEDESC); fdp->fd_nfiles = 0; cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); if (cdir) { locked = VFS_LOCK_GIANT(cdir->v_mount); vrele(cdir); VFS_UNLOCK_GIANT(locked); } if (rdir) { locked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(locked); } if (jdir) { locked = VFS_LOCK_GIANT(jdir->v_mount); vrele(jdir); VFS_UNLOCK_GIANT(locked); } fddrop(fdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; if ((vp->v_vflag & VV_PROCDEP) != 0) return (1); } return (0); } /* * Make this setguid thing safe, if at all possible. */ void setugidsafety(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* * Note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! */ FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx] == fp) { fdp->fd_ofiles[idx] = NULL; fdunused(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; FILEDESC_XLOCK(fdp); /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, i, fp); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t retval, save; int i, error, devnull; fdp = td->td_proc->p_fd; if (fdp == NULL) return (0); KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; for (i = 0; i < 3; i++) { if (fdp->fd_ofiles[i] != NULL) continue; if (devnull < 0) { save = td->td_retval[0]; error = kern_open(td, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); devnull = td->td_retval[0]; KASSERT(devnull == i, ("oof, we didn't get our fd")); td->td_retval[0] = save; if (error) break; } else { error = do_dup(td, DUP_FIXED, devnull, i, &retval); if (error != 0) break; } } return (error); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { int vfslocked; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } VFS_UNLOCK_GIANT(vfslocked); } return (fdrop(fp, td)); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist, EBADF is returned. * * If the descriptor exists but doesn't match 'flags' then return EBADF for * read attempts and EINVAL for write attempts. * * If 'hold' is set (non-zero) the file's refcount will be bumped on return. * It should be dropped with fdrop(). If it is not set, then the refcount * will not be bumped however the thread's filedesc struct will be returned * locked (for fgetsock). * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is set and zero is returned. */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) { struct filedesc *fdp; struct file *fp; *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); FILEDESC_SLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { FILEDESC_SUNLOCK(fdp); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. * * Only one flag, or 0, may be specified. */ if (flags == FREAD && (fp->f_flag & FREAD) == 0) { FILEDESC_SUNLOCK(fdp); return (EBADF); } if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { FILEDESC_SUNLOCK(fdp); return (EBADF); } if (hold) { fhold(fp); FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0, 1)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD, 1)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE, 1)); } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct file *fp; int error; *vpp = NULL; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } FILEDESC_SUNLOCK(td->td_proc->p_fd); return (error); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FREAD)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FWRITE)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if the * descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX lock * in the future. * * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being free'd * during use. */ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct file *fp; int error; *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } FILEDESC_SUNLOCK(td->td_proc->p_fd); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * XXXRW: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } /* * Handle the last reference to a file being closed. */ int _fdrop(struct file *fp, struct thread *td) { int error; error = 0; if (fp->f_count != 0) panic("fdrop: count %d", fp->f_count); if (fp->f_ops != &badfileops) error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int vfslocked; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) { struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if (dfd < 0 || dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { FILEDESC_XUNLOCK(fdp); return (EACCES); } fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; if (fp == NULL) fdused(fdp, indx); fhold(wfp); FILEDESC_XUNLOCK(fdp); if (fp != NULL) /* * We now own the reference to fp that the ofiles[] * array used to own. Release it. */ fdrop(fp, td); return (0); case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; fdunused(fdp, dfd); if (fp == NULL) fdused(fdp, indx); FILEDESC_XUNLOCK(fdp); /* * We now own the reference to fp that the ofiles[] array * used to own. Release it. */ if (fp != NULL) fdrop(fp, td); return (0); default: FILEDESC_XUNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Scan all active processes to see if any of them have a current or root * directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { fdp = fdhold(p); if (fdp == NULL) continue; nrele = 0; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); while (nrele--) vrele(olddp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); vref(newdp); rootvnode = newdp; } } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; MALLOC(fdtol, struct filedesc_to_leader *, sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = fdhold(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ if (fdp->fd_lastfile > 0) n += fdp->fd_lastfile; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; PROC_UNLOCK(p); fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n]) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = 0; xf.xf_offset = fp->f_offset; xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { char *fullpath, *freepath; struct kinfo_file *kif; struct filedesc *fdp; int error, i, *name; struct socket *so; struct vnode *vp; struct file *fp; struct proc *p; int vfslocked; name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); if ((error = p_candebug(curthread, p))) { PROC_UNLOCK(p); return (error); } fdp = fdhold(p); PROC_UNLOCK(p); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; i++) { if ((fp = fdp->fd_ofiles[i]) == NULL) continue; bzero(kif, sizeof(*kif)); kif->kf_structsize = sizeof(*kif); vp = NULL; so = NULL; kif->kf_fd = i; switch (fp->f_type) { case DTYPE_VNODE: kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; break; case DTYPE_SOCKET: kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; break; case DTYPE_PIPE: kif->kf_type = KF_TYPE_PIPE; break; case DTYPE_FIFO: kif->kf_type = KF_TYPE_FIFO; vp = fp->f_vnode; vref(vp); break; case DTYPE_KQUEUE: kif->kf_type = KF_TYPE_KQUEUE; break; case DTYPE_CRYPTO: kif->kf_type = KF_TYPE_CRYPTO; break; case DTYPE_MQUEUE: kif->kf_type = KF_TYPE_MQUEUE; break; default: kif->kf_type = KF_TYPE_UNKNOWN; break; } kif->kf_ref_count = fp->f_count; if (fp->f_flag & FREAD) kif->kf_flags |= KF_FLAG_READ; if (fp->f_flag & FWRITE) kif->kf_flags |= KF_FLAG_WRITE; if (fp->f_flag & FAPPEND) kif->kf_flags |= KF_FLAG_APPEND; if (fp->f_flag & FASYNC) kif->kf_flags |= KF_FLAG_ASYNC; if (fp->f_flag & FFSYNC) kif->kf_flags |= KF_FLAG_FSYNC; if (fp->f_flag & FNONBLOCK) kif->kf_flags |= KF_FLAG_NONBLOCK; if (fp->f_flag & O_DIRECT) kif->kf_flags |= KF_FLAG_DIRECT; if (fp->f_flag & FHASLOCK) kif->kf_flags |= KF_FLAG_HASLOCK; kif->kf_offset = fp->f_offset; if (vp != NULL) { vref(vp); switch (vp->v_type) { case VNON: kif->kf_vnode_type = KF_VTYPE_VNON; break; case VREG: kif->kf_vnode_type = KF_VTYPE_VREG; break; case VDIR: kif->kf_vnode_type = KF_VTYPE_VDIR; break; case VBLK: kif->kf_vnode_type = KF_VTYPE_VBLK; break; case VCHR: kif->kf_vnode_type = KF_VTYPE_VCHR; break; case VLNK: kif->kf_vnode_type = KF_VTYPE_VLNK; break; case VSOCK: kif->kf_vnode_type = KF_VTYPE_VSOCK; break; case VFIFO: kif->kf_vnode_type = KF_VTYPE_VFIFO; break; case VBAD: kif->kf_vnode_type = KF_VTYPE_VBAD; break; default: kif->kf_vnode_type = KF_VTYPE_UNKNOWN; break; } /* * It is OK to drop the filedesc lock here as we will * re-validate and re-evaluate its properties when * the loop continues. */ freepath = NULL; fullpath = "-"; FILEDESC_SUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vn_fullpath(curthread, vp, &fullpath, &freepath); vput(vp); VFS_UNLOCK_GIANT(vfslocked); strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); if (freepath != NULL) free(freepath, M_TEMP); FILEDESC_SLOCK(fdp); } if (so != NULL) { struct sockaddr *sa; if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { bcopy(sa, &kif->kf_sa_local, sa->sa_len); free(sa, M_SONAME); } if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) == 00 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { bcopy(sa, &kif->kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; kif->kf_sock_type = so->so_type; kif->kf_sock_protocol = so->so_proto->pr_protocol; } error = SYSCTL_OUT(req, kif, sizeof(*kif)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD, sysctl_kern_proc_filedesc, "Process filedesc entries"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); case DTYPE_SHM: return ("shm"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n]) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, 0, fp->f_count, 0, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n]) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (EBADF); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, }; /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL) Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 175201) +++ head/sys/kern/kern_exec.c (revision 175202) @@ -1,1310 +1,1310 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_mac.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); static void exec_free_args(struct image_args *); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); return (error); #else return (ENOSYS); #endif } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; int error; AUDIT_ARG(argv, args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG(envv, args->begin_envv, args->envc, args->endp - args->begin_envv); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p); exec_free_args(args); return (ERESTART); /* Try again later. */ } PROC_UNLOCK(p); } error = do_execve(td, args, mac_p); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(SINGLE_EXIT); else thread_single_end(); PROC_UNLOCK(p); } return (error); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int vfslocked; int textset; #ifdef MAC struct label *interplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif vfslocked = 0; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->execlabel = NULL; imgp->attr = &attr; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif imgp->image_header = NULL; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ ndp = &nd; NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); interpret: error = namei(ndp); if (error) goto exec_fail; vfslocked = NDHASGIANT(ndp); imgp->vp = ndp->ni_vp; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC interplabel = mac_vnode_label_alloc(); mac_vnode_copy_label(ndp->ni_vp->v_label, interplabel); #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ fdunshare(p, td); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* close files on exec */ VOP_UNLOCK(imgp->vp, 0, td); fdcloseexec(td); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); VOP_UNLOCK(imgp->vp, 0, td); error = fdcheckstd(td); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, newcred, imgp->vp, interplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. * Use tdsignal to deliver signal to current thread, use * psignal may cause the signal to be delivered to wrong thread * because that thread will exit, remember we are going to enter * single thread mode. */ if (p->p_flag & P_TRACED) tdsignal(p, td, SIGTRAP, NULL); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); vfs_mark_atime(imgp->vp, td); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); VOP_UNLOCK(imgp->vp, 0, td); /* * Handle deferred decrement of ref counts. */ if (textvp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(textvp->v_mount); vrele(textvp); VFS_UNLOCK_GIANT(tvfslocked); } if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(tvfslocked); } if (tracecred != NULL) crfree(tracecred); #endif - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); done2: #ifdef MAC mac_execve_exit(imgp); if (interplabel != NULL) mac_vnode_label_free(interplabel); #endif VFS_UNLOCK_GIANT(vfslocked); exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ } return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_LOCK(object); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { object->flags |= OBJ_COLORED; object->pg_color = 0; } #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if (ma[i]->valid) break; if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_lock_queues(); vm_page_free(ma[0]); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(object); return (EIO); } } vm_page_lock_queues(); vm_page_hold(ma[0]); vm_page_unlock_queues(); vm_page_wakeup(ma[0]); VM_OBJECT_UNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ if (sv->sv_maxssiz != NULL) ssiz = *sv->sv_maxssiz; else ssiz = maxssiz; stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ stack_addr = IA64_BACKINGSTORE; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { char *argp, *envp; int error; size_t length; error = 0; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate temporary demand zeroed space for argument and * environment strings: * * o ARG_MAX for argument and environment; * o MAXSHELLCMDLEN for the name of interpreters. */ args->buf = (char *) kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); if (args->buf == NULL) return (ENOMEM); args->begin_argv = args->buf; args->endp = args->begin_argv; args->stringspace = ARG_MAX; args->fname = args->buf + ARG_MAX; /* * Copy the file name. */ error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; /* * extract arguments first */ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) { if (argp == (caddr_t) -1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(argp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(envp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } static void exec_free_args(struct image_args *args) { if (args->buf) { kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); args->buf = NULL; } } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_jail.c =================================================================== --- head/sys/kern/kern_jail.c (revision 175201) +++ head/sys/kern/kern_jail.c (revision 175202) @@ -1,985 +1,985 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jail rules"); int jail_set_hostname_allowed = 1; SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW, &jail_set_hostname_allowed, 0, "Processes in jail can set their hostnames"); int jail_socket_unixiproute_only = 1; SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW, &jail_socket_unixiproute_only, 0, "Processes in jail are limited to creating UNIX/IPv4/route sockets only"); int jail_sysvipc_allowed = 0; SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW, &jail_sysvipc_allowed, 0, "Processes in jail can use System V IPC primitives"); static int jail_enforce_statfs = 2; SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW, &jail_enforce_statfs, 0, "Processes in jail cannot see all mounted file systems"); int jail_allow_raw_sockets = 0; SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW, &jail_allow_raw_sockets, 0, "Prison root can create raw sockets"); int jail_chflags_allowed = 0; SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW, &jail_chflags_allowed, 0, "Processes in jail can alter system file flags"); int jail_mount_allowed = 0; SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW, &jail_mount_allowed, 0, "Processes in jail can mount/unmount jail-friendly file systems"); /* allprison, lastprid, and prisoncount are protected by allprison_lock. */ struct prisonlist allprison; struct sx allprison_lock; int lastprid = 0; int prisoncount = 0; /* * List of jail services. Protected by allprison_lock. */ TAILQ_HEAD(prison_services_head, prison_service); static struct prison_services_head prison_services = TAILQ_HEAD_INITIALIZER(prison_services); static int prison_service_slots = 0; struct prison_service { prison_create_t ps_create; prison_destroy_t ps_destroy; int ps_slotno; TAILQ_ENTRY(prison_service) ps_next; char ps_name[0]; }; static void init_prison(void *); static void prison_complete(void *context, int pending); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS); static void init_prison(void *data __unused) { sx_init(&allprison_lock, "allprison"); LIST_INIT(&allprison); } SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL); /* * struct jail_args { * struct jail *jail; * }; */ int jail(struct thread *td, struct jail_args *uap) { struct nameidata nd; struct prison *pr, *tpr; struct prison_service *psrv; struct jail j; struct jail_attach_args jaa; int vfslocked, error, tryprid; error = copyin(uap->jail, &j, sizeof(j)); if (error) return (error); if (j.version != 0) return (EINVAL); MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF); pr->pr_ref = 1; error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0); if (error) goto e_killmtx; NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, pr->pr_path, td); error = namei(&nd); if (error) goto e_killmtx; vfslocked = NDHASGIANT(&nd); pr->pr_root = nd.ni_vp; VOP_UNLOCK(nd.ni_vp, 0, td); NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(vfslocked); error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0); if (error) goto e_dropvnref; pr->pr_ip = j.ip_number; pr->pr_linux = NULL; pr->pr_securelevel = securelevel; if (prison_service_slots == 0) pr->pr_slots = NULL; else { pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots, M_PRISON, M_ZERO | M_WAITOK); } /* Determine next pr_id and add prison to allprison list. */ sx_xlock(&allprison_lock); tryprid = lastprid + 1; if (tryprid == JAIL_MAX) tryprid = 1; next: LIST_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id == tryprid) { tryprid++; if (tryprid == JAIL_MAX) { sx_xunlock(&allprison_lock); error = EAGAIN; goto e_dropvnref; } goto next; } } pr->pr_id = jaa.jid = lastprid = tryprid; LIST_INSERT_HEAD(&allprison, pr, pr_list); prisoncount++; sx_downgrade(&allprison_lock); TAILQ_FOREACH(psrv, &prison_services, ps_next) { psrv->ps_create(psrv, pr); } sx_sunlock(&allprison_lock); error = jail_attach(td, &jaa); if (error) goto e_dropprref; mtx_lock(&pr->pr_mtx); pr->pr_ref--; mtx_unlock(&pr->pr_mtx); td->td_retval[0] = jaa.jid; return (0); e_dropprref: sx_xlock(&allprison_lock); LIST_REMOVE(pr, pr_list); prisoncount--; sx_downgrade(&allprison_lock); TAILQ_FOREACH(psrv, &prison_services, ps_next) { psrv->ps_destroy(psrv, pr); } sx_sunlock(&allprison_lock); e_dropvnref: vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); vrele(pr->pr_root); VFS_UNLOCK_GIANT(vfslocked); e_killmtx: mtx_destroy(&pr->pr_mtx); FREE(pr, M_PRISON); return (error); } /* * struct jail_attach_args { * int jid; * }; */ int jail_attach(struct thread *td, struct jail_attach_args *uap) { struct proc *p; struct ucred *newcred, *oldcred; struct prison *pr; int vfslocked, error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); p = td->td_proc; sx_slock(&allprison_lock); pr = prison_find(uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } pr->pr_ref++; mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); - vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0, td); change_root(pr->pr_root, td); VFS_UNLOCK_GIANT(vfslocked); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; setsugid(p); crcopy(newcred, oldcred); newcred->cr_prison = pr; p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0, td); VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&pr->pr_mtx); pr->pr_ref--; mtx_unlock(&pr->pr_mtx); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); LIST_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref == 0) { mtx_unlock(&pr->pr_mtx); break; } return (pr); } } return (NULL); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); pr->pr_ref--; if (pr->pr_ref == 0) { mtx_unlock(&pr->pr_mtx); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } static void prison_complete(void *context, int pending) { struct prison_service *psrv; struct prison *pr; int vfslocked; pr = (struct prison *)context; sx_xlock(&allprison_lock); LIST_REMOVE(pr, pr_list); prisoncount--; sx_downgrade(&allprison_lock); TAILQ_FOREACH(psrv, &prison_services, ps_next) { psrv->ps_destroy(psrv, pr); } sx_sunlock(&allprison_lock); vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); vrele(pr->pr_root); VFS_UNLOCK_GIANT(vfslocked); mtx_destroy(&pr->pr_mtx); if (pr->pr_linux != NULL) FREE(pr->pr_linux, M_PRISON); FREE(pr, M_PRISON); } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (id=%d).", pr->pr_id)); pr->pr_ref++; mtx_unlock(&pr->pr_mtx); } u_int32_t prison_getip(struct ucred *cred) { return (cred->cr_prison->pr_ip); } int prison_ip(struct ucred *cred, int flag, u_int32_t *ip) { u_int32_t tmp; if (!jailed(cred)) return (0); if (flag) tmp = *ip; else tmp = ntohl(*ip); if (tmp == INADDR_ANY) { if (flag) *ip = cred->cr_prison->pr_ip; else *ip = htonl(cred->cr_prison->pr_ip); return (0); } if (tmp == INADDR_LOOPBACK) { if (flag) *ip = cred->cr_prison->pr_ip; else *ip = htonl(cred->cr_prison->pr_ip); return (0); } if (cred->cr_prison->pr_ip != tmp) return (1); return (0); } void prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip) { u_int32_t tmp; if (!jailed(cred)) return; if (flag) tmp = *ip; else tmp = ntohl(*ip); if (tmp == INADDR_LOOPBACK) { if (flag) *ip = cred->cr_prison->pr_ip; else *ip = htonl(cred->cr_prison->pr_ip); return; } return; } int prison_if(struct ucred *cred, struct sockaddr *sa) { struct sockaddr_in *sai; int ok; sai = (struct sockaddr_in *)sa; if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only) ok = 1; else if (sai->sin_family != AF_INET) ok = 0; else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr)) ok = 1; else ok = 0; return (ok); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { if (jailed(cred1)) { if (!jailed(cred2)) return (ESRCH); if (cred2->cr_prison != cred1->cr_prison) return (ESRCH); } return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != NULL); } /* * Return the correct hostname for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { if (jailed(cred)) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_host, size); mtx_unlock(&cred->cr_prison->pr_mtx); } else strlcpy(buf, hostname, size); } /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; if (!jailed(cred) || jail_enforce_statfs == 0) return (0); pr = cred->cr_prison; if (pr->pr_root->v_mount == mp) return (0); if (jail_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is ugly check, but this is the only situation when jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have situation where jail's root directory * is "/some/path" and mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; if (!jailed(cred) || jail_enforce_statfs == 0) return; pr = cred->cr_prison; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check with permission for a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (jail_chflags_allowed) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (jail_mount_allowed) return (0); else return (EPERM); /* * Allow jailed root to bind reserved ports and reuse in-use * ports. */ case PRIV_NETINET_RESERVEDPORT: case PRIV_NETINET_REUSEPORT: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (jail_allow_raw_sockets) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges, many system * configuration privileges. */ return (EPERM); } } /* * Register jail service. Provides 'create' and 'destroy' methods. * 'create' method will be called for every existing jail and all * jails in the future as they beeing created. * 'destroy' method will be called for every jail going away and * for all existing jails at the time of service deregistration. */ struct prison_service * prison_service_register(const char *name, prison_create_t create, prison_destroy_t destroy) { struct prison_service *psrv, *psrv2; struct prison *pr; int reallocate = 1, slotno = 0; void **slots, **oldslots; psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON, M_WAITOK | M_ZERO); psrv->ps_create = create; psrv->ps_destroy = destroy; strcpy(psrv->ps_name, name); /* * Grab the allprison_lock here, so we won't miss any jail * creation/destruction. */ sx_xlock(&allprison_lock); #ifdef INVARIANTS /* * Verify if service is not already registered. */ TAILQ_FOREACH(psrv2, &prison_services, ps_next) { KASSERT(strcmp(psrv2->ps_name, name) != 0, ("jail service %s already registered", name)); } #endif /* * Find free slot. When there is no existing free slot available, * allocate one at the end. */ TAILQ_FOREACH(psrv2, &prison_services, ps_next) { if (psrv2->ps_slotno != slotno) { KASSERT(slotno < psrv2->ps_slotno, ("Invalid slotno (slotno=%d >= ps_slotno=%d", slotno, psrv2->ps_slotno)); /* We found free slot. */ reallocate = 0; break; } slotno++; } psrv->ps_slotno = slotno; /* * Keep the list sorted by slot number. */ if (psrv2 != NULL) { KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0")); TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next); } else { KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0")); TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next); } prison_service_slots++; sx_downgrade(&allprison_lock); /* * Allocate memory for new slot if we didn't found empty one. * Do not use realloc(9), because pr_slots is protected with a mutex, * so we can't sleep. */ LIST_FOREACH(pr, &allprison, pr_list) { if (reallocate) { /* First allocate memory with M_WAITOK. */ slots = malloc(sizeof(*slots) * prison_service_slots, M_PRISON, M_WAITOK); /* Now grab the mutex and replace pr_slots. */ mtx_lock(&pr->pr_mtx); oldslots = pr->pr_slots; if (psrv->ps_slotno > 0) { bcopy(oldslots, slots, sizeof(*slots) * (prison_service_slots - 1)); } slots[psrv->ps_slotno] = NULL; pr->pr_slots = slots; mtx_unlock(&pr->pr_mtx); if (oldslots != NULL) free(oldslots, M_PRISON); } /* * Call 'create' method for each existing jail. */ psrv->ps_create(psrv, pr); } sx_sunlock(&allprison_lock); return (psrv); } void prison_service_deregister(struct prison_service *psrv) { struct prison *pr; void **slots, **oldslots; int last = 0; sx_xlock(&allprison_lock); if (TAILQ_LAST(&prison_services, prison_services_head) == psrv) last = 1; TAILQ_REMOVE(&prison_services, psrv, ps_next); prison_service_slots--; sx_downgrade(&allprison_lock); LIST_FOREACH(pr, &allprison, pr_list) { /* * Call 'destroy' method for every currently existing jail. */ psrv->ps_destroy(psrv, pr); /* * If this is the last slot, free the memory allocated for it. */ if (last) { if (prison_service_slots == 0) slots = NULL; else { slots = malloc(sizeof(*slots) * prison_service_slots, M_PRISON, M_WAITOK); } mtx_lock(&pr->pr_mtx); oldslots = pr->pr_slots; /* * We require setting slot to NULL after freeing it, * this way we can check for memory leaks here. */ KASSERT(oldslots[psrv->ps_slotno] == NULL, ("Slot %d (service %s, jailid=%d) still contains data?", psrv->ps_slotno, psrv->ps_name, pr->pr_id)); if (psrv->ps_slotno > 0) { bcopy(oldslots, slots, sizeof(*slots) * prison_service_slots); } pr->pr_slots = slots; mtx_unlock(&pr->pr_mtx); KASSERT(oldslots != NULL, ("oldslots == NULL")); free(oldslots, M_PRISON); } } sx_sunlock(&allprison_lock); free(psrv, M_PRISON); } /* * Function sets data for the given jail in slot assigned for the given * jail service. */ void prison_service_data_set(struct prison_service *psrv, struct prison *pr, void *data) { mtx_assert(&pr->pr_mtx, MA_OWNED); pr->pr_slots[psrv->ps_slotno] = data; } /* * Function clears slots assigned for the given jail service in the given * prison structure and returns current slot data. */ void * prison_service_data_del(struct prison_service *psrv, struct prison *pr) { void *data; mtx_assert(&pr->pr_mtx, MA_OWNED); data = pr->pr_slots[psrv->ps_slotno]; pr->pr_slots[psrv->ps_slotno] = NULL; return (data); } /* * Function returns current data from the slot assigned to the given jail * service for the given jail. */ void * prison_service_data_get(struct prison_service *psrv, struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); return (pr->pr_slots[psrv->ps_slotno]); } static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp, *sxp; struct prison *pr; int count, error; if (jailed(req->td->td_ucred)) return (0); sx_slock(&allprison_lock); if ((count = prisoncount) == 0) { sx_sunlock(&allprison_lock); return (0); } sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO); LIST_FOREACH(pr, &allprison, pr_list) { xp->pr_version = XPRISON_VERSION; xp->pr_id = pr->pr_id; xp->pr_ip = pr->pr_ip; strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path)); mtx_lock(&pr->pr_mtx); strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host)); mtx_unlock(&pr->pr_mtx); xp++; } sx_sunlock(&allprison_lock); error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count); free(sxp, M_TEMP); return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); Index: head/sys/kern/kern_ktrace.c =================================================================== --- head/sys/kern/kern_ktrace.c (revision 175201) +++ head/sys/kern/kern_ktrace.c (revision 175202) @@ -1,1011 +1,1011 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ktrace facility allows the tracing of certain key events in user space * processes, such as system calls, signal delivery, context switches, and * user generated events using utrace(2). It works by streaming event * records and data to a vnode associated with the process using the * ktrace(2) system call. In general, records can be written directly from * the context that generates the event. One important exception to this is * during a context switch, where sleeping is not permitted. To handle this * case, trace events are generated using in-kernel ktr_request records, and * then delivered to disk at a convenient moment -- either immediately, the * next traceable event, at system call return, or at process exit. * * When dealing with multiple threads or processes writing to the same event * log, ordering guarantees are weak: specifically, if an event has multiple * records (i.e., system call enter and return), they may be interlaced with * records from another event. Process and thread ID information is provided * in the record, and user applications can de-interlace events if required. */ static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); #ifdef KTRACE #ifndef KTRACE_REQUEST_POOL #define KTRACE_REQUEST_POOL 100 #endif struct ktr_request { struct ktr_header ktr_header; void *ktr_buffer; union { struct ktr_syscall ktr_syscall; struct ktr_sysret ktr_sysret; struct ktr_genio ktr_genio; struct ktr_psig ktr_psig; struct ktr_csw ktr_csw; } ktr_data; STAILQ_ENTRY(ktr_request) ktr_list; }; static int data_lengths[] = { 0, /* none */ offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */ sizeof(struct ktr_sysret), /* KTR_SYSRET */ 0, /* KTR_NAMEI */ sizeof(struct ktr_genio), /* KTR_GENIO */ sizeof(struct ktr_psig), /* KTR_PSIG */ sizeof(struct ktr_csw), /* KTR_CSW */ 0 /* KTR_USER */ }; static STAILQ_HEAD(, ktr_request) ktr_free; static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options"); static u_int ktr_requestpool = KTRACE_REQUEST_POOL; TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool); static u_int ktr_geniosize = PAGE_SIZE; TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize); SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize, 0, "Maximum size of genio event payload"); static int print_message = 1; struct mtx ktrace_mtx; static struct sx ktrace_sx; static void ktrace_init(void *dummy); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); static u_int ktrace_resize_pool(u_int newsize); static struct ktr_request *ktr_getrequest(int type); static void ktr_submitrequest(struct thread *td, struct ktr_request *req); static void ktr_freerequest(struct ktr_request *req); static void ktr_writerequest(struct thread *td, struct ktr_request *req); static int ktrcanset(struct thread *,struct proc *); static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); static int ktrops(struct thread *,struct proc *,int,int,struct vnode *); /* * ktrace itself generates events, such as context switches, which we do not * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine * whether or not it is in a region where tracing of events should be * suppressed. */ static void ktrace_enter(struct thread *td) { KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set")); td->td_pflags |= TDP_INKTRACE; } static void ktrace_exit(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set")); td->td_pflags &= ~TDP_INKTRACE; } static void ktrace_assert(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set")); } static void ktrace_init(void *dummy) { struct ktr_request *req; int i; mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); sx_init(&ktrace_sx, "ktrace_sx"); STAILQ_INIT(&ktr_free); for (i = 0; i < ktr_requestpool; i++) { req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); } } SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS) { struct thread *td; u_int newsize, oldsize, wantsize; int error; /* Handle easy read-only case first to avoid warnings from GCC. */ if (!req->newptr) { mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; mtx_unlock(&ktrace_mtx); return (SYSCTL_OUT(req, &oldsize, sizeof(u_int))); } error = SYSCTL_IN(req, &wantsize, sizeof(u_int)); if (error) return (error); td = curthread; ktrace_enter(td); mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; newsize = ktrace_resize_pool(wantsize); mtx_unlock(&ktrace_mtx); ktrace_exit(td); error = SYSCTL_OUT(req, &oldsize, sizeof(u_int)); if (error) return (error); if (wantsize > oldsize && newsize < wantsize) return (ENOSPC); return (0); } SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW, &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", ""); static u_int ktrace_resize_pool(u_int newsize) { struct ktr_request *req; int bound; mtx_assert(&ktrace_mtx, MA_OWNED); print_message = 1; bound = newsize - ktr_requestpool; if (bound == 0) return (ktr_requestpool); if (bound < 0) /* Shrink pool down to newsize if possible. */ while (bound++ < 0) { req = STAILQ_FIRST(&ktr_free); if (req == NULL) return (ktr_requestpool); STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); ktr_requestpool--; mtx_unlock(&ktrace_mtx); free(req, M_KTRACE); mtx_lock(&ktrace_mtx); } else /* Grow pool up to newsize. */ while (bound-- > 0) { mtx_unlock(&ktrace_mtx); req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); ktr_requestpool++; } return (ktr_requestpool); } static struct ktr_request * ktr_getrequest(int type) { struct ktr_request *req; struct thread *td = curthread; struct proc *p = td->td_proc; int pm; ktrace_enter(td); /* XXX: In caller instead? */ mtx_lock(&ktrace_mtx); if (!KTRCHECK(td, type)) { mtx_unlock(&ktrace_mtx); ktrace_exit(td); return (NULL); } req = STAILQ_FIRST(&ktr_free); if (req != NULL) { STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); req->ktr_header.ktr_type = type; if (p->p_traceflag & KTRFAC_DROP) { req->ktr_header.ktr_type |= KTR_DROP; p->p_traceflag &= ~KTRFAC_DROP; } mtx_unlock(&ktrace_mtx); microtime(&req->ktr_header.ktr_time); req->ktr_header.ktr_pid = p->p_pid; req->ktr_header.ktr_tid = td->td_tid; bcopy(td->td_name, req->ktr_header.ktr_comm, MAXCOMLEN + 1); req->ktr_buffer = NULL; req->ktr_header.ktr_len = 0; } else { p->p_traceflag |= KTRFAC_DROP; pm = print_message; print_message = 0; mtx_unlock(&ktrace_mtx); if (pm) printf("Out of ktrace request objects.\n"); ktrace_exit(td); } return (req); } /* * Some trace generation environments don't permit direct access to VFS, * such as during a context switch where sleeping is not allowed. Under these * circumstances, queue a request to the thread to be written asynchronously * later. */ static void ktr_enqueuerequest(struct thread *td, struct ktr_request *req) { mtx_lock(&ktrace_mtx); STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list); mtx_unlock(&ktrace_mtx); ktrace_exit(td); } /* * Drain any pending ktrace records from the per-thread queue to disk. This * is used both internally before committing other records, and also on * system call return. We drain all the ones we can find at the time when * drain is requested, but don't keep draining after that as those events * may me approximately "after" the current event. */ static void ktr_drain(struct thread *td) { struct ktr_request *queued_req; STAILQ_HEAD(, ktr_request) local_queue; ktrace_assert(td); sx_assert(&ktrace_sx, SX_XLOCKED); STAILQ_INIT(&local_queue); /* XXXRW: needed? */ if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) { mtx_lock(&ktrace_mtx); STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr); mtx_unlock(&ktrace_mtx); while ((queued_req = STAILQ_FIRST(&local_queue))) { STAILQ_REMOVE_HEAD(&local_queue, ktr_list); ktr_writerequest(td, queued_req); ktr_freerequest(queued_req); } } } /* * Submit a trace record for immediate commit to disk -- to be used only * where entering VFS is OK. First drain any pending records that may have * been cached in the thread. */ static void ktr_submitrequest(struct thread *td, struct ktr_request *req) { ktrace_assert(td); sx_xlock(&ktrace_sx); ktr_drain(td); ktr_writerequest(td, req); ktr_freerequest(req); sx_xunlock(&ktrace_sx); ktrace_exit(td); } static void ktr_freerequest(struct ktr_request *req) { if (req->ktr_buffer != NULL) free(req->ktr_buffer, M_KTRACE); mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); mtx_unlock(&ktrace_mtx); } void ktrsyscall(code, narg, args) int code, narg; register_t args[]; { struct ktr_request *req; struct ktr_syscall *ktp; size_t buflen; char *buf = NULL; buflen = sizeof(register_t) * narg; if (buflen > 0) { buf = malloc(buflen, M_KTRACE, M_WAITOK); bcopy(args, buf, buflen); } req = ktr_getrequest(KTR_SYSCALL); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } ktp = &req->ktr_data.ktr_syscall; ktp->ktr_code = code; ktp->ktr_narg = narg; if (buflen > 0) { req->ktr_header.ktr_len = buflen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrsysret(code, error, retval) int code, error; register_t retval; { struct ktr_request *req; struct ktr_sysret *ktp; req = ktr_getrequest(KTR_SYSRET); if (req == NULL) return; ktp = &req->ktr_data.ktr_sysret; ktp->ktr_code = code; ktp->ktr_error = error; ktp->ktr_retval = retval; /* what about val2 ? */ ktr_submitrequest(curthread, req); } /* * When a process exits, drain per-process asynchronous trace records. */ void ktrprocexit(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } /* * When a thread returns, drain any asynchronous records generated by the * system call. */ void ktruserret(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } void ktrnamei(path) char *path; { struct ktr_request *req; int namelen; char *buf = NULL; namelen = strlen(path); if (namelen > 0) { buf = malloc(namelen, M_KTRACE, M_WAITOK); bcopy(path, buf, namelen); } req = ktr_getrequest(KTR_NAMEI); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } if (namelen > 0) { req->ktr_header.ktr_len = namelen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrgenio(fd, rw, uio, error) int fd; enum uio_rw rw; struct uio *uio; int error; { struct ktr_request *req; struct ktr_genio *ktg; int datalen; char *buf; if (error) { free(uio, M_IOV); return; } uio->uio_offset = 0; uio->uio_rw = UIO_WRITE; datalen = imin(uio->uio_resid, ktr_geniosize); buf = malloc(datalen, M_KTRACE, M_WAITOK); error = uiomove(buf, datalen, uio); free(uio, M_IOV); if (error) { free(buf, M_KTRACE); return; } req = ktr_getrequest(KTR_GENIO); if (req == NULL) { free(buf, M_KTRACE); return; } ktg = &req->ktr_data.ktr_genio; ktg->ktr_fd = fd; ktg->ktr_rw = rw; req->ktr_header.ktr_len = datalen; req->ktr_buffer = buf; ktr_submitrequest(curthread, req); } void ktrpsig(sig, action, mask, code) int sig; sig_t action; sigset_t *mask; int code; { struct ktr_request *req; struct ktr_psig *kp; req = ktr_getrequest(KTR_PSIG); if (req == NULL) return; kp = &req->ktr_data.ktr_psig; kp->signo = (char)sig; kp->action = action; kp->mask = *mask; kp->code = code; ktr_enqueuerequest(curthread, req); } void ktrcsw(out, user) int out, user; { struct ktr_request *req; struct ktr_csw *kc; req = ktr_getrequest(KTR_CSW); if (req == NULL) return; kc = &req->ktr_data.ktr_csw; kc->out = out; kc->user = user; ktr_enqueuerequest(curthread, req); } #endif /* KTRACE */ /* Interface and common routines */ #ifndef _SYS_SYSPROTO_H_ struct ktrace_args { char *fname; int ops; int facs; int pid; }; #endif /* ARGSUSED */ int ktrace(td, uap) struct thread *td; register struct ktrace_args *uap; { #ifdef KTRACE register struct vnode *vp = NULL; register struct proc *p; struct pgrp *pg; int facs = uap->facs & ~KTRFAC_ROOT; int ops = KTROP(uap->ops); int descend = uap->ops & KTRFLAG_DESCEND; int nfound, ret = 0; int flags, error = 0, vfslocked; struct nameidata nd; struct ucred *cred; /* * Need something to (un)trace. */ if (ops != KTROP_CLEARFILE && facs == 0) return (EINVAL); ktrace_enter(td); if (ops != KTROP_CLEAR) { /* * an operation which requires a file argument. */ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->fname, td); flags = FREAD | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, 0, NULL); if (error) { ktrace_exit(td); return (error); } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); ktrace_exit(td); return (EACCES); } VFS_UNLOCK_GIANT(vfslocked); } /* * Clear all uses of the tracefile. */ if (ops == KTROP_CLEARFILE) { int vrele_count; vrele_count = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { if (ktrcanset(td, p)) { mtx_lock(&ktrace_mtx); cred = p->p_tracecred; p->p_tracecred = NULL; p->p_tracevp = NULL; p->p_traceflag = 0; mtx_unlock(&ktrace_mtx); vrele_count++; crfree(cred); } else error = EPERM; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (vrele_count > 0) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } goto done; } /* * do it */ sx_slock(&proctree_lock); if (uap->pid < 0) { /* * by process group */ pg = pgfind(-uap->pid); if (pg == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } /* * ktrops() may call vrele(). Lock pg_members * by the proctree_lock rather than pg_mtx. */ PGRP_UNLOCK(pg); nfound = 0; LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p_cansee(td, p) != 0) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); nfound++; if (descend) ret |= ktrsetchildren(td, p, ops, facs, vp); else ret |= ktrops(td, p, ops, facs, vp); } if (nfound == 0) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } } else { /* * by pid */ p = pfind(uap->pid); if (p == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } error = p_cansee(td, p); /* * The slock of the proctree lock will keep this process * from going away, so unlocking the proc here is ok. */ PROC_UNLOCK(p); if (error) { sx_sunlock(&proctree_lock); goto done; } if (descend) ret |= ktrsetchildren(td, p, ops, facs, vp); else ret |= ktrops(td, p, ops, facs, vp); } sx_sunlock(&proctree_lock); if (!ret) error = EPERM; done: if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); (void) vn_close(vp, FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); } ktrace_exit(td); return (error); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } /* ARGSUSED */ int utrace(td, uap) struct thread *td; register struct utrace_args *uap; { #ifdef KTRACE struct ktr_request *req; void *cp; int error; if (!KTRPOINT(td, KTR_USER)) return (0); if (uap->len > KTR_USER_MAXLEN) return (EINVAL); cp = malloc(uap->len, M_KTRACE, M_WAITOK); error = copyin(uap->addr, cp, uap->len); if (error) { free(cp, M_KTRACE); return (error); } req = ktr_getrequest(KTR_USER); if (req == NULL) { free(cp, M_KTRACE); return (ENOMEM); } req->ktr_buffer = cp; req->ktr_header.ktr_len = uap->len; ktr_submitrequest(td, req); return (0); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } #ifdef KTRACE static int ktrops(td, p, ops, facs, vp) struct thread *td; struct proc *p; int ops, facs; struct vnode *vp; { struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; PROC_LOCK(p); if (!ktrcanset(td, p)) { PROC_UNLOCK(p); return (0); } mtx_lock(&ktrace_mtx); if (ops == KTROP_SET) { if (p->p_tracevp != vp) { /* * if trace file already in use, relinquish below */ tracevp = p->p_tracevp; VREF(vp); p->p_tracevp = vp; } if (p->p_tracecred != td->td_ucred) { tracecred = p->p_tracecred; p->p_tracecred = crhold(td->td_ucred); } p->p_traceflag |= facs; if (priv_check(td, PRIV_KTRACE) == 0) p->p_traceflag |= KTRFAC_ROOT; } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { /* no more tracing */ p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; } } mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(vfslocked); } if (tracecred != NULL) crfree(tracecred); return (1); } static int ktrsetchildren(td, top, ops, facs, vp) struct thread *td; struct proc *top; int ops, facs; struct vnode *vp; { register struct proc *p; register int ret = 0; p = top; sx_assert(&proctree_lock, SX_LOCKED); for (;;) { ret |= ktrops(td, p, ops, facs, vp); /* * If this process has children, descend to them next, * otherwise do any siblings, and if done with this level, * follow back up the tree (but not past top). */ if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) return (ret); if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } /*NOTREACHED*/ } static void ktr_writerequest(struct thread *td, struct ktr_request *req) { struct ktr_header *kth; struct vnode *vp; struct proc *p; struct ucred *cred; struct uio auio; struct iovec aiov[3]; struct mount *mp; int datalen, buflen, vrele_count; int error, vfslocked; /* * We hold the vnode and credential for use in I/O in case ktrace is * disabled on the process as we write out the request. * * XXXRW: This is not ideal: we could end up performing a write after * the vnode has been closed. */ mtx_lock(&ktrace_mtx); vp = td->td_proc->p_tracevp; if (vp != NULL) VREF(vp); cred = td->td_proc->p_tracecred; if (cred != NULL) crhold(cred); mtx_unlock(&ktrace_mtx); /* * If vp is NULL, the vp has been cleared out from under this * request, so just drop it. Make sure the credential and vnode are * in sync: we should have both or neither. */ if (vp == NULL) { KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL")); return; } KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL")); kth = &req->ktr_header; datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP]; buflen = kth->ktr_len; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; aiov[0].iov_base = (caddr_t)kth; aiov[0].iov_len = sizeof(struct ktr_header); auio.uio_resid = sizeof(struct ktr_header); auio.uio_iovcnt = 1; auio.uio_td = td; if (datalen != 0) { aiov[1].iov_base = (caddr_t)&req->ktr_data; aiov[1].iov_len = datalen; auio.uio_resid += datalen; auio.uio_iovcnt++; kth->ktr_len += datalen; } if (buflen != 0) { KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write")); aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer; aiov[auio.uio_iovcnt].iov_len = buflen; auio.uio_resid += buflen; auio.uio_iovcnt++; } vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); #ifdef MAC error = mac_vnode_check_write(cred, NOCRED, vp); if (error == 0) #endif error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); if (!error) return; /* * If error encountered, give up tracing on this vnode. We defer * all the vrele()'s on the vnode until after we are finished walking * the various lists to avoid needlessly holding locks. */ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", error); vrele_count = 0; /* * First, clear this vnode from being used by any processes in the * system. * XXX - If one process gets an EPERM writing to the vnode, should * we really do this? Other processes might have suitable * credentials for the operation. */ cred = NULL; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { mtx_lock(&ktrace_mtx); p->p_tracevp = NULL; p->p_traceflag = 0; cred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); vrele_count++; } PROC_UNLOCK(p); if (cred != NULL) { crfree(cred); cred = NULL; } } sx_sunlock(&allproc_lock); /* * We can't clear any pending requests in threads that have cached * them but not yet committed them, as those are per-thread. The * thread will have to clear it itself on system call return. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } /* * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_ROOT signifies that * root previously set the tracing status on the target process, and * so, only root may further change it. */ static int ktrcanset(td, targetp) struct thread *td; struct proc *targetp; { PROC_LOCK_ASSERT(targetp, MA_OWNED); if (targetp->p_traceflag & KTRFAC_ROOT && priv_check(td, PRIV_KTRACE)) return (0); if (p_candebug(td, targetp) != 0) return (0); return (1); } #endif /* KTRACE */ Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 175201) +++ head/sys/kern/kern_proc.c (revision 175202) @@ -1,1628 +1,1627 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif #include #include #include #include #include #include MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); static int proc_ctor(void *mem, int size, void *arg, int flags); static void proc_dtor(void *mem, int size, void *arg); static int proc_init(void *mem, int size, int flags); static void proc_fini(void *mem, int size); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct sx allproc_lock; struct sx proctree_lock; struct mtx ppeers_lock; uma_zone_t proc_zone; uma_zone_t ithread_zone; int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, ""); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); /* * Initialize global process hashing structures. */ void procinit() { sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static int proc_ctor(void *mem, int size, void *arg, int flags) { struct proc *p; p = (struct proc *)mem; EVENTHANDLER_INVOKE(process_ctor, p); return (0); } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; /* INVARIANTS checks go here */ p = (struct proc *)mem; td = FIRST_THREAD_IN_PROC(p); if (td != NULL) { #ifdef INVARIANTS KASSERT((p->p_numthreads == 1), ("bad number of threads in exiting process")); KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif /* Dispose of an alternate kstack, if it exists. * XXX What if there are more than one thread in the proc? * The first thread in the proc is special and not * freed, so you gotta do this here. */ if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0)) vm_thread_dispose_altkstack(td); } EVENTHANDLER_INVOKE(process_dtor, p); if (p->p_ksi != NULL) KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue")); } /* * Initialize type-stable parts of a proc (when newly created). */ static int proc_init(void *mem, int size, int flags) { struct proc *p; p = (struct proc *)mem; p->p_sched = (struct p_sched *)&p[1]; bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); TAILQ_INIT(&p->p_threads); /* all threads in proc */ EVENTHANDLER_INVOKE(process_init, p); p->p_stats = pstats_alloc(); return (0); } /* * UMA should ensure that this function is never called. * Freeing a proc structure would violate type stability. */ static void proc_fini(void *mem, int size) { #ifdef notnow struct proc *p; p = (struct proc *)mem; EVENTHANDLER_INVOKE(process_fini, p); pstats_free(p->p_stats); thread_free(FIRST_THREAD_IN_PROC(p)); mtx_destroy(&p->p_mtx); if (p->p_ksi != NULL) ksiginfo_free(p->p_ksi); #else panic("proc reclaimed"); #endif } /* * Is p an inferior of the current process? */ int inferior(p) register struct proc *p; { sx_assert(&proctree_lock, SX_LOCKED); for (; p != curproc; p = p->p_pptr) if (p->p_pid == 0) return (0); return (1); } /* * Locate a process by number; return only "live" processes -- i.e., neither * zombies nor newly born but incompletely initialized processes. By not * returning processes in the PRS_NEW state, we allow callers to avoid * testing for that condition to avoid dereferencing p_ucred, et al. */ struct proc * pfind(pid) register pid_t pid; { register struct proc *p; sx_slock(&allproc_lock); LIST_FOREACH(p, PIDHASH(pid), p_hash) if (p->p_pid == pid) { if (p->p_state == PRS_NEW) { p = NULL; break; } PROC_LOCK(p); break; } sx_sunlock(&allproc_lock); return (p); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Create a new process group. * pgid must be equal to the pid of p. * Begin a new session if required. */ int enterpgrp(p, pgid, pgrp, sess) register struct proc *p; pid_t pgid; struct pgrp *pgrp; struct session *sess; { struct pgrp *pgrp2; sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); pgrp2 = pgfind(pgid); KASSERT(pgrp2 == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); mtx_lock(&Giant); /* XXX TTY */ PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; sess->s_count = 1; sess->s_ttyvp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { mtx_lock(&Giant); /* XXX TTY */ pgrp->pg_session = p->p_session; SESS_LOCK(pgrp->pg_session); pgrp->pg_session->s_count++; SESS_UNLOCK(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. */ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); mtx_unlock(&Giant); /* XXX TTY */ doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(p, pgrp) register struct proc *p; struct pgrp *pgrp; { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p.\n", __func__, pgrp->pg_session, p->p_session)); KASSERT(pgrp != p->p_pgrp, ("%s: p belongs to pgrp.", __func__)); doenterpgrp(p, pgrp); return (0); } /* * Move p to a process group */ static void doenterpgrp(p, pgrp) struct proc *p; struct pgrp *pgrp; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); mtx_lock(&Giant); /* XXX TTY */ PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); mtx_unlock(&Giant); /* XXX TTY */ if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; mtx_lock(&Giant); /* XXX TTY */ PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); mtx_unlock(&Giant); /* XXX TTY */ if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { struct session *savesess; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); mtx_lock(&Giant); /* XXX TTY */ PGRP_LOCK(pgrp); if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; SESSRELE(savesess); PGRP_UNLOCK(pgrp); mtx_destroy(&pgrp->pg_mtx); FREE(pgrp, M_PGRP); mtx_unlock(&Giant); /* XXX TTY */ } static void pgadjustjobc(pgrp, entering) struct pgrp *pgrp; int entering; { PGRP_LOCK(pgrp); if (entering) pgrp->pg_jobc++; else { --pgrp->pg_jobc; if (pgrp->pg_jobc == 0) orphanpg(pgrp); } PGRP_UNLOCK(pgrp); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ mysession = pgrp->pg_session; if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) pgadjustjobc(pgrp, entering); /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) { hispgrp = p->p_pgrp; if (hispgrp == pgrp || hispgrp->pg_session != mysession) continue; PROC_LOCK(p); if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); pgadjustjobc(hispgrp, entering); } } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. */ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p)) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); psignal(p, SIGHUP); psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } void sessrele(struct session *s) { int i; SESS_LOCK(s); i = --s->s_count; SESS_UNLOCK(s); if (i == 0) { if (s->s_ttyp != NULL) ttyrel(s->s_ttyp); mtx_destroy(&s->s_mtx); FREE(s, M_SESSION); } } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Clear kinfo_proc and fill in any information that is common * to all threads in the process. * Must be called with the target process locked. */ static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) { struct thread *td0; struct tty *tp; struct session *sp; struct ucred *cred; struct sigacts *ps; bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = p->p_tracevp; mtx_lock(&ktrace_mtx); kp->ki_traceflag = p->p_traceflag; mtx_unlock(&ktrace_mtx); #endif kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; kp->ki_flag = p->p_flag; cred = p->p_ucred; if (cred) { kp->ki_uid = cred->cr_uid; kp->ki_ruid = cred->cr_ruid; kp->ki_svuid = cred->cr_svuid; /* XXX bde doesn't like KI_NGROUPS */ kp->ki_ngroups = min(cred->cr_ngroups, KI_NGROUPS); bcopy(cred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; /* If inside a jail, use 0 as a jail ID. */ if (!jailed(curthread->td_ucred)) kp->ki_jid = cred->cr_prison->pr_id; } } ps = p->p_sigacts; if (ps) { mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } PROC_SLOCK(p); if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; if (td0->td_altkstack_obj != NULL) kp->ki_rssize += td0->td_altkstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } else if (p->p_state == PRS_ZOMBIE) kp->ki_stat = SZOMB; if (kp->ki_flag & P_INMEM) kp->ki_sflag = PS_INMEM; else kp->ki_sflag = 0; /* Calculate legacy swtime as seconds since 'swtick'. */ kp->ki_swtime = (ticks - p->p_swtick) / hz; kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); PROC_SUNLOCK(p); if ((p->p_flag & P_INMEM) && p->p_stats != NULL) { kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); PROC_SLOCK(p); calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime); PROC_SUNLOCK(p); calccru(p, &kp->ki_childutime, &kp->ki_childstime); /* Some callers want child-times in a single value */ kp->ki_childtime = kp->ki_childstime; timevaladd(&kp->ki_childtime, &kp->ki_childutime); } tp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; tp = sp->s_ttyp; SESS_UNLOCK(sp); } } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = dev2udev(tp->t_dev); kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NODEV; if (p->p_comm[0] != '\0') strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); if (p->p_sysent && p->p_sysent->sv_name != NULL && p->p_sysent->sv_name[0] != '\0') strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul)); kp->ki_siglist = p->p_siglist; kp->ki_xstat = p->p_xstat; kp->ki_acflag = p->p_acflag; kp->ki_lock = p->p_lock; if (p->p_pptr) kp->ki_ppid = p->p_pptr->p_pid; } /* * Fill in information that is thread specific. * Must be called with p_slock locked. */ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg)); if (td->td_name[0] != '\0') strlcpy(kp->ki_ocomm, td->td_name, sizeof(kp->ki_ocomm)); if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } else { kp->ki_kiflag &= ~KI_LOCKBLOCK; bzero(kp->ki_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* XXXKSE very approximate */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else if (p->p_state == PRS_ZOMBIE) { kp->ki_stat = SZOMB; } else { kp->ki_stat = SIDL; } /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_tid = td->td_tid; kp->ki_numthreads = p->p_numthreads; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_pctcpu = sched_pctcpu(td); kp->ki_estcpu = td->td_estcpu; kp->ki_slptime = (ticks - td->td_slptick) / hz; kp->ki_pri.pri_class = td->td_pri_class; kp->ki_pri.pri_user = td->td_user_pri; /* We can't get this anymore but ps etc never used it anyway. */ kp->ki_rqindex = 0; SIGSETOR(kp->ki_siglist, td->td_siglist); kp->ki_sigmask = td->td_sigmask; thread_unlock(td); } /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. */ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { fill_kinfo_proc_only(p, kp); PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp); PROC_SUNLOCK(p); } struct pstats * pstats_alloc(void) { return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK)); } /* * Copy parts of p_stats; zero the rest of p_stats (statistics). */ void pstats_fork(struct pstats *src, struct pstats *dst) { bzero(&dst->pstat_startzero, __rangeof(struct pstats, pstat_startzero, pstat_endzero)); bcopy(&src->pstat_startcopy, &dst->pstat_startcopy, __rangeof(struct pstats, pstat_startcopy, pstat_endcopy)); } void pstats_free(struct pstats *ps) { free(ps, M_SUBPROC); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) if (p->p_pid == pid) { PROC_LOCK(p); break; } sx_sunlock(&allproc_lock); return (p); } #define KERN_PROC_ZOMBMASK 0x3 #define KERN_PROC_NOTHREADS 0x4 /* * Must be called with the process locked and will return with it unlocked. */ static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags) { struct thread *td; struct kinfo_proc kinfo_proc; int error = 0; struct proc *np; pid_t pid = p->p_pid; PROC_LOCK_ASSERT(p, MA_OWNED); fill_kinfo_proc_only(p, &kinfo_proc); if (flags & KERN_PROC_NOTHREADS) { PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc); PROC_SUNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); } else { PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &kinfo_proc); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); if (error) break; } else error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); PROC_SUNLOCK(p); } PROC_UNLOCK(p); if (error) return (error); if (flags & KERN_PROC_ZOMBMASK) np = zpfind(pid); else { if (pid == 0) return (0); np = pfind(pid); } if (np == NULL) return EAGAIN; if (np != p) { PROC_UNLOCK(np); return EAGAIN; } PROC_UNLOCK(np); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; int flags, doingzomb, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); p = pfind((pid_t)name[0]); if (!p) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } error = sysctl_out_proc(p, req, flags); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_slock(&allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. */ PROC_SLOCK(p); if (p->p_state == PRS_NEW) { PROC_SUNLOCK(p); continue; } PROC_SUNLOCK(p); PROC_LOCK(p); KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) { PROC_UNLOCK(p); continue; } /* * TODO - make more efficient (see notes below). * do by session. */ switch (oid_number) { case KERN_PROC_GID: if (p->p_ucred->cr_gid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RGID: if (p->p_ucred->cr_rgid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) { PROC_UNLOCK(p); continue; } SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || dev2udev(p->p_session->s_ttyp->t_dev) != (dev_t)name[0]) { SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); continue; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred->cr_uid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RUID: if (p->p_ucred->cr_ruid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags | doingzomb); if (error) { sx_sunlock(&allproc_lock); return (error); } } } sx_sunlock(&allproc_lock); return (0); } struct pargs * pargs_alloc(int len) { struct pargs *pa; MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS, M_WAITOK); refcount_init(&pa->ar_ref, 1); pa->ar_length = len; return (pa); } void pargs_free(struct pargs *pa) { FREE(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; refcount_acquire(&pa->ar_ref); } void pargs_drop(struct pargs *pa) { if (pa == NULL) return; if (refcount_release(&pa->ar_ref)) pargs_free(pa); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; int error = 0; if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (ESRCH); if ((error = p_cansee(curthread, p)) != 0) { PROC_UNLOCK(p); return (error); } if (req->newptr && curproc != p) { PROC_UNLOCK(p); return (EPERM); } pa = p->p_args; pargs_hold(pa); PROC_UNLOCK(p); if (req->oldptr != NULL && pa != NULL) error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); if (error != 0 || req->newptr == NULL) return (error); if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (ENOMEM); newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } /* * This sysctl allows a process to retrieve the path of the executable for * itself or another process. */ static int sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct vnode *vp; char *retbuf, *freebuf; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; } else { p = pfind(*pidp); if (p == NULL) return (ESRCH); if ((error = p_cansee(curthread, p)) != 0) { PROC_UNLOCK(p); return (error); } } vp = p->p_textvp; if (vp == NULL) { if (*pidp != -1) PROC_UNLOCK(p); return (0); } vref(vp); if (*pidp != -1) PROC_UNLOCK(p); error = vn_fullpath(req->td, vp, &retbuf, &freebuf); vrele(vp); if (error) return (error); error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1); free(freebuf, M_TEMP); return (error); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } static int sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS) { vm_map_entry_t entry, tmp_entry; unsigned int last_timestamp; char *fullpath, *freepath; struct kinfo_vmentry *kve; int error, *name; struct vnode *vp; struct proc *p; vm_map_t map; name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) { PROC_UNLOCK(p); return (ESRCH); } if ((error = p_candebug(curthread, p))) { PROC_UNLOCK(p); return (error); } _PHOLD(p); PROC_UNLOCK(p); kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK); map = &p->p_vmspace->vm_map; /* XXXRW: More locking required? */ vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj, tobj, lobj; vm_offset_t addr; int vfslocked; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; bzero(kve, sizeof(*kve)); kve->kve_structsize = sizeof(*kve); kve->kve_private_resident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); if (obj->shadow_count == 1) kve->kve_private_resident = obj->resident_page_count; } kve->kve_resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) kve->kve_resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_LOCK(tobj); if (lobj != obj) VM_OBJECT_UNLOCK(lobj); lobj = tobj; } freepath = NULL; fullpath = ""; if (lobj) { vp = NULL; switch(lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: kve->kve_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_UNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_UNLOCK(obj); if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, - curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vn_fullpath(curthread, vp, &fullpath, &freepath); vput(vp); VFS_UNLOCK_GIANT(vfslocked); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } kve->kve_start = (void*)entry->start; kve->kve_end = (void*)entry->end; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); last_timestamp = map->timestamp; vm_map_unlock_read(map); error = SYSCTL_OUT(req, kve, sizeof(*kve)); vm_map_lock_read(map); if (error) break; if (last_timestamp + 1 != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); PRELE(p); free(kve, M_TEMP); return (error); } #if defined(STACK) || defined(DDB) static int sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS) { struct kinfo_kstack *kkstp; int error, i, *name, numthreads; lwpid_t *lwpidarray; struct thread *td; struct stack *st; struct sbuf sb; struct proc *p; name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); /* XXXRW: Not clear ESRCH is the right error during proc execve(). */ if (p->p_flag & P_WEXIT || p->p_flag & P_INEXEC) { PROC_UNLOCK(p); return (ESRCH); } if ((error = p_candebug(curthread, p))) { PROC_UNLOCK(p); return (error); } _PHOLD(p); PROC_UNLOCK(p); kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK); st = stack_create(); lwpidarray = NULL; numthreads = 0; PROC_SLOCK(p); repeat: if (numthreads < p->p_numthreads) { if (lwpidarray != NULL) { free(lwpidarray, M_TEMP); lwpidarray = NULL; } numthreads = p->p_numthreads; PROC_SUNLOCK(p); lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP, M_WAITOK | M_ZERO); PROC_SLOCK(p); goto repeat; } PROC_SUNLOCK(p); i = 0; /* * XXXRW: During the below loop, execve(2) and countless other sorts * of changes could have taken place. Should we check to see if the * vmspace has been replaced, or the like, in order to prevent * giving a snapshot that spans, say, execve(2), with some threads * before and some after? Among other things, the credentials could * have changed, in which case the right to extract debug info might * no longer be assured. */ PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td) { KASSERT(i < numthreads, ("sysctl_kern_proc_kstack: numthreads")); lwpidarray[i] = td->td_tid; i++; } numthreads = i; for (i = 0; i < numthreads; i++) { td = thread_find(p, lwpidarray[i]); if (td == NULL) { continue; } bzero(kkstp, sizeof(*kkstp)); (void)sbuf_new(&sb, kkstp->kkst_trace, sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN); thread_lock(td); kkstp->kkst_tid = td->td_tid; if (TD_IS_SWAPPED(td)) kkstp->kkst_state = KKST_STATE_SWAPPED; else if (TD_IS_RUNNING(td)) kkstp->kkst_state = KKST_STATE_RUNNING; else { kkstp->kkst_state = KKST_STATE_STACKOK; stack_save_td(st, td); } thread_unlock(td); PROC_UNLOCK(p); stack_sbuf_print(&sb, st); sbuf_finish(&sb); sbuf_delete(&sb); error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp)); PROC_LOCK(p); if (error) break; } _PRELE(p); PROC_UNLOCK(p); if (lwpidarray != NULL) free(lwpidarray, M_TEMP); stack_destroy(st); free(kkstp, M_TEMP); return (error); } #endif SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD, sysctl_kern_proc_pathname, "Process executable path"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD, sysctl_kern_proc_vmmap, "Process vm map entries"); #if defined(STACK) || defined(DDB) static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD, sysctl_kern_proc_kstack, "Process kernel stacks"); #endif Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 175201) +++ head/sys/kern/kern_sig.c (revision 175202) @@ -1,3327 +1,3327 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ static int coredump(struct thread *); static char *expand_name(const char *, uid_t, pid_t); static int killpg1(struct thread *td, int sig, int pgid, int all); static int issignal(struct thread *p); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static void sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); #ifdef KSE static int do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *); #endif static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo); SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #define SA_PROC 0x80 /* deliverable to any thread */ static int sigproptbl[NSIG] = { SA_KILL|SA_PROC, /* SIGHUP */ SA_KILL|SA_PROC, /* SIGINT */ SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL|SA_PROC, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL|SA_PROC, /* SIGPIPE */ SA_KILL|SA_PROC, /* SIGALRM */ SA_KILL|SA_PROC, /* SIGTERM */ SA_IGNORE|SA_PROC, /* SIGURG */ SA_STOP|SA_PROC, /* SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */ SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */ SA_IGNORE|SA_PROC, /* SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */ SA_IGNORE|SA_PROC, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL|SA_PROC, /* SIGVTALRM */ SA_KILL|SA_PROC, /* SIGPROF */ SA_IGNORE|SA_PROC, /* SIGWINCH */ SA_IGNORE|SA_PROC, /* SIGINFO */ SA_KILL|SA_PROC, /* SIGUSR1 */ SA_KILL|SA_PROC, /* SIGUSR2 */ }; static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_kill, signo)) { count++; SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if ((si->ksi_flags & KSI_TRAP) != 0) { if (ret != 0) SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } if (ret != 0) return (ret); out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); } void sigqueue_collect_set(sigqueue_t *sq, sigset_t *set) { ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link) SIGADDSET(*set, ksi->ksi_signo); SIGSETOR(*set, sq->sq_kill); } void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp) { sigset_t tmp, set; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); /* * make a copy, this allows setp to point to src or dst * sq_signals without trouble. */ set = *setp; p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_signals; SIGSETAND(tmp, set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); /* Finally, rescan src queue and set pending bits for it */ sigqueue_collect_set(src, &src->sq_signals); } void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } void sigqueue_delete_set(sigqueue_t *sq, sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_signals, *set); /* Finally, rescan queue and set pending bits for it */ sigqueue_collect_set(sq, &sq->sq_signals); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ void sigqueue_delete_set_proc(struct proc *p, sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); PROC_SUNLOCK(p); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; #ifdef KSE sigset_t set, saved; #else sigset_t set; #endif p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If our mask changed we may have to move signal that were * previously masked by all threads to our sigqueue. */ set = p->p_sigqueue.sq_signals; #ifdef KSE if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; #endif SIGSETNAND(set, td->td_sigmask); if (! SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } #endif } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(td, sig, act, oact, flags) struct thread *td; register int sig; struct sigaction *act, *oact; int flags; { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { #ifdef KSE if ((p->p_flag & P_SA) && SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif /* never to be seen again */ PROC_SLOCK(p); sigqueue_delete_proc(p, sig); PROC_SUNLOCK(p); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(ps->ps_sigignore, i); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); SIGDELSET(ps->ps_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); PROC_SLOCK(p); sigqueue_delete_proc(p, sig); PROC_SUNLOCK(p); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(td, how, set, oset, old) struct thread *td; int how; sigset_t *set, *oset; int old; { int error; PROC_LOCK(td->td_proc); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); SIGSETOR(td->td_sigmask, *set); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); break; case SIG_SETMASK: SIG_CANTMASK(*set); if (old) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; signotify(td); break; default: error = EINVAL; break; } } PROC_UNLOCK(td->td_proc); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t savedmask; struct proc *p; int error, sig, hz, i, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; sig = 0; ets.tv_sec = 0; ets.tv_nsec = 0; SIG_CANTMASK(waitset); PROC_LOCK(p); ps = p->p_sigacts; savedmask = td->td_sigmask; if (timeout) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); ets = rts; timespecadd(&ets, timeout); } } restart: for (i = 1; i <= _SIG_MAXSIG; ++i) { if (!SIGISMEMBER(waitset, i)) continue; if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) { if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) { #ifdef KSE if (p->p_flag & P_SA) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif sigqueue_move(&p->p_sigqueue, &td->td_sigqueue, i); } else continue; } SIGFILLSET(td->td_sigmask); SIG_CANTMASK(td->td_sigmask); SIGDELSET(td->td_sigmask, i); mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); if (sig) goto out; else { /* * Because cursig() may have stopped current thread, * after it is resumed, things may have already been * changed, it should rescan any pending signals. */ goto restart; } } if (error) goto out; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout) { if (!timevalid) { error = EINVAL; goto out; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; goto out; } ts = ets; timespecsub(&ts, &rts); TIMESPEC_TO_TIMEVAL(&tv, &ts); hz = tvtohz(&tv); } else hz = 0; td->td_sigmask = savedmask; SIGSETNAND(td->td_sigmask, waitset); signotify(td); error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz); if (timeout) { if (error == ERESTART) { /* timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* will calculate timeout by ourself. */ error = 0; } } goto restart; out: td->td_sigmask = savedmask; signotify(td); if (sig) { ksiginfo_init(ksi); sigqueue_get(&td->td_sigqueue, sig, ksi); ksi->ksi_signo = sig; if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, 0); } #endif if (sig == SIGKILL) sigexit(td, sig); } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETOR(td->td_sigmask, set); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETLO(td->td_sigmask, set); signotify(td); PROC_UNLOCK(p); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; SIG_CANTMASK(mask); td->td_sigmask = mask; signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { struct proc *p = td->td_proc; sigset_t mask; PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; OSIG2SIG(uap->mask, mask); SIG_CANTMASK(mask); SIGSETLO(td->td_sigmask, mask); signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(td, sig, pgid, all) register struct thread *td; int sig, pgid, all; { register struct proc *p; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW ) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (nfound ? 0 : ESRCH); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int kill(td, uap) register struct thread *td; register struct kill_args *uap; { register struct proc *p; int error; AUDIT_ARG(signum, uap->signum); AUDIT_ARG(pid, uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } AUDIT_ARG(process, p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0)); } /* NOTREACHED */ } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(td, uap) struct thread *td; register struct okillpg_args *uap; { AUDIT_ARG(signum, uap->signum); AUDIT_ARG(pid, uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); return (killpg1(td, uap->signum, uap->pgid, 0)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sigqueue(struct thread *td, struct sigqueue_args *uap) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (uap->pid <= 0) return (EINVAL); if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum != 0) { ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value.sival_ptr = uap->value; error = tdsignal(p, NULL, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(pgid, sig) int pgid, sig; { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(pgrp, sig, checkctty) struct pgrp *pgrp; int sig, checkctty; { register struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (checkctty == 0 || p->p_flag & P_CONTROLT) psignal(p, sig); PROC_UNLOCK(p); } } } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; #ifdef KSE int error; #endif int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); #ifdef KSE if (td->td_pflags & TDP_SA) { if (td->td_mailbox == NULL) thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); thread_lock(td); /* * Force scheduling an upcall, so UTS has chance to * process the signal before thread runs again in * userland. */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; thread_unlock(td); } else { PROC_LOCK(p); } #else PROC_LOCK(p); #endif ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { td->td_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif #ifdef KSE if (!(td->td_pflags & TDP_SA)) (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #else (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #endif #ifdef KSE else if (td->td_mailbox == NULL) { mtx_unlock(&ps->ps_mtx); /* UTS caused a sync signal */ p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ sigexit(td, sig); } else { mtx_unlock(&ps->ps_mtx); SIGADDSET(td->td_sigmask, sig); PROC_UNLOCK(p); error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig, sizeof(siginfo_t)); PROC_LOCK(p); /* UTS memory corrupted */ if (error) sigexit(td, SIGSEGV); mtx_lock(&ps->ps_mtx); } #endif SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching conetxt to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); PROC_SUNLOCK(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void psignal(struct proc *p, int sig) { (void) tdsignal(p, NULL, sig, NULL); } int psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td = NULL; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue")); /* * ksi_code and other fields should be set before * calling this function. */ ksi->ksi_signo = sigev->sigev_signo; ksi->ksi_value = sigev->sigev_value; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = thread_find(p, sigev->sigev_notify_thread_id); if (td == NULL) return (ESRCH); } return (tdsignal(p, td, ksi->ksi_signo, ksi)); } int tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #ifdef KSE sigset_t saved; int ret; if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; ret = do_tdsignal(p, td, sig, ksi); if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } return (ret); } static int do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #endif sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) #ifdef KSE panic("do_tdsignal(): invalid signal %d", sig); #else panic("tdsignal(): invalid signal %d", sig); #endif #ifdef KSE KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue")); #else KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue")); #endif /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); /* * If the signal is blocked and not destined for this thread, then * assign it to the process so that we can find it later in the first * thread that unblocks it. Otherwise, assign it to this thread now. */ if (td == NULL) { td = sigtd(p, sig, prop); if (SIGISMEMBER(td->td_sigmask, sig)) sigqueue = &p->p_sigqueue; else sigqueue = &td->td_sigqueue; } else { KASSERT(td->td_proc == p, ("invalid thread")); sigqueue = &td->td_sigqueue; } /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SA_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } PROC_SLOCK(p); sigqueue_delete_proc(p, SIGCONT); PROC_SUNLOCK(p); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* * SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediatly, such as * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ PROC_SLOCK(p); if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. All the threads should be * either winding down or already on the suspended queue. */ if (p->p_flag & P_TRACED) { /* * The traced process is already stopped, * so no further action is necessary. * No signal can restart us. */ PROC_SUNLOCK(p); goto out; } if (sig == SIGKILL) { /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs * to run at least one thread, but which one? * It would seem that the answer would be to * run an upcall in the next KSE to run, and * deliver the signal that way. In a NON KSE * process, we need to make sure that the * single thread is runnable asap. * XXXKSE for now however, make them all run. */ #endif /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ PROC_SUNLOCK(p); p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { thread_lock(td); tdsigwakeup(td, sig, action, intrval); thread_unlock(td); PROC_SUNLOCK(p); goto out; } MPASS(action == SIG_DFL); if (prop & SA_STOP) { if (p->p_flag & P_PPWAIT) { PROC_SUNLOCK(p); goto out; } p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xstat); } else PROC_SUNLOCK(p); goto out; } else goto runfast; /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: thread_lock(td); tdsigwakeup(td, sig, action, intrval); thread_unlock(td); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); prop = sigprop(sig); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. */ if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) return; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); PROC_SLOCK(p); thread_lock(td); return; } /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER) sched_prio(td, PUSER); sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } } static void sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } else { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } } int ptracestop(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); thread_lock(td); td->td_flags |= TDF_XSIG; thread_unlock(td); td->td_xsig = sig; PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { thread_lock(td); td->td_flags &= ~TDF_XSIG; thread_unlock(td); PROC_SUNLOCK(p); return (sig); } /* * Just make wait() to work, the last stopped thread * will win. */ p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); sig_suspend_threads(td, p, 0); stopme: thread_suspend_switch(td); if (!(p->p_flag & P_TRACED)) { break; } if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); return (td->td_xsig); } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(td) struct thread *td; { struct proc *p; struct sigacts *ps; sigset_t sigpending; int sig, prop, newsig; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { /* * If traced, always stop. */ mtx_unlock(&ps->ps_mtx); newsig = ptracestop(td, sig); mtx_lock(&ps->ps_mtx); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif if (sig != newsig) { ksiginfo_t ksi; /* * clear old signal. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ sigqueue_get(&td->td_sigqueue, sig, &ksi); /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ if (newsig == 0) continue; sig = newsig; /* * Put the new signal into td_sigqueue. If the * signal is being masked, look for other signals. */ SIGADDSET(td->td_sigqueue.sq_signals, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGDELSET(td->td_sigmask, sig); #endif if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { if (p->p_flag & P_TRACED || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; int code; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); sigqueue_get(&td->td_sigqueue, sig, &ksi); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, 0); #endif if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } #ifdef KSE if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) { #else if (action == SIG_DFL) { #endif /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { #ifdef KSE if (td->td_pflags & TDP_SA) { if (sig == SIGKILL) { mtx_unlock(&ps->ps_mtx); sigexit(td, sig); } } #endif /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } td->td_ru.ru_nsignals++; if (p->p_sig != sig) { code = 0; } else { code = p->p_code; p->p_code = 0; p->p_sig = 0; } #ifdef KSE if (td->td_pflags & TDP_SA) thread_signal_add(curthread, &ksi); else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #endif } } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, W_EXITCODE(0, sig)); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int status) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, status); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xstat); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason; int status = p->p_xstat; /* convert to int */ reason = CLD_EXITED; if (WCOREDUMP(status)) reason = CLD_DUMPED; else if (WIFSIGNALED(status)) reason = CLD_KILLED; /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } static char corefilename[MAXPATHLEN] = {"%N.core"}; SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, sizeof(corefilename), "process corefile name format string"); /* * expand_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static char * expand_name(name, uid, pid) const char *name; uid_t uid; pid_t pid; { const char *format, *appendstr; char *temp; char buf[11]; /* Buffer for pid/uid -- max 4B */ size_t i, l, n; format = corefilename; temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); if (temp == NULL) return (NULL); for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': appendstr = "%"; break; case 'N': /* process name */ appendstr = name; break; case 'P': /* process id */ sprintf(buf, "%u", pid); appendstr = buf; break; case 'U': /* user id */ sprintf(buf, "%u", uid); appendstr = buf; break; default: appendstr = ""; log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); } l = strlen(appendstr); if ((n + l) >= MAXPATHLEN) goto toolong; memcpy(temp + n, appendstr, l); n += l; break; default: temp[n++] = format[i]; } } if (format[i] != '\0') goto toolong; return (temp); toolong: log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", (long)pid, name, (u_long)uid); free(temp, M_TEMP); return (NULL); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; register struct vnode *vp; register struct ucred *cred = td->td_ucred; struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags, locked; struct mount *mp; char *name; /* name of corefile */ off_t limit; int vfslocked; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); if (name == NULL) { #ifdef AUDIT audit_proc_coredump(td, NULL, EINVAL); #endif return (EINVAL); } if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { PROC_UNLOCK(p); #ifdef AUDIT audit_proc_coredump(td, name, EFAULT); #endif free(name, M_TEMP); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(p, RLIMIT_CORE); PROC_UNLOCK(p); if (limit == 0) { #ifdef AUDIT audit_proc_coredump(td, name, EFBIG); #endif free(name, M_TEMP); return (EFBIG); } restart: NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td); flags = O_CREAT | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, NULL); if (error) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0, td); error = EFAULT; goto close; } VOP_UNLOCK(vp, 0, td); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { lf.l_type = F_UNLCK; if (locked) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, td)) != 0) goto out; if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; VFS_UNLOCK_GIANT(vfslocked); goto restart; } VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_LEASE(vp, td, cred, LEASE_WRITE); VOP_SETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); error = p->p_sysent->sv_coredump ? p->p_sysent->sv_coredump(td, vp, limit) : ENOSYS; if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } close: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; out: #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); psignal(p, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(&p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); ps->ps_refcnt = 1; mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt--; if (ps->ps_refcnt == 0) { mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } else mtx_unlock(&ps->ps_mtx); } struct sigacts * sigacts_hold(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt++; mtx_unlock(&ps->ps_mtx); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { int shared; mtx_lock(&ps->ps_mtx); shared = ps->ps_refcnt > 1; mtx_unlock(&ps->ps_mtx); return (shared); } Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 175201) +++ head/sys/kern/uipc_mqueue.c (revision 175202) @@ -1,2489 +1,2488 @@ /*- * Copyright (c) 2005 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; u_int32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { 1, NULL, filt_mqdetach, filt_mqread }; struct filterops mq_wfiltops = { 1, NULL, filt_mqdetach, filt_mqwrite }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { int old, exp; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) mqfs_destroy(node); } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_type = nodetype; node->mn_refcount = 1; getnanotime(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp, struct thread *td) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags, struct thread *td) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); sx_xlock(&mqfs->mi_lock); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); sx_xunlock(&mqfs->mi_lock); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vrecycle(vp, curthread); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; int error; LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) break; } if (vd != NULL) { if (vget(vd->mv_vnode, 0, curthread) == 0) { *vpp = vd->mv_vnode; - vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE, - curthread); + vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE); return (0); } /* XXX if this can happen, we're in trouble */ } error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp); if (error) return (error); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(*vpp, mp); if (error != 0) { *vpp = NULLVP; return (error); } vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len) { struct mqfs_node *pn; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { if (strncmp(pn->mn_name, name, len) == 0) return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0, cnp->cn_thread); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ pn = mqfs_search(pd, pname, namelen); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); int rc; sx_xlock(&mqfs->mi_lock); rc = mqfs_lookupx(ap); sx_xunlock(&mqfs->mi_lock); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); #if 0 /* named node */ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen); if (pn != NULL) { mqueue_free(mq); sx_xunlock(&mqfs->mi_lock); return (EEXIST); } #else if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); #endif pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) error = ENOSPC; else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } sx_xunlock(&mqfs->mi_lock); if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp, ap->a_td); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_mode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; VATTR_NULL(vap); vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; vap->va_filerev = 0; vap->va_vaflags = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(ap->a_td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); #if 0 /* named node */ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen); if (pn != NULL) { sx_xunlock(&mqfs->mi_lock); return (EEXIST); } #else if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); #endif pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn == NULL) error = ENOSPC; else error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue", NULL, MTX_DEF); knlist_init(&mq->mq_rsel.si_note, &mq->mq_mutex, NULL, NULL, NULL); knlist_init(&mq->mq_wsel.si_note, &mq->mq_mutex, NULL, NULL, NULL); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); FREE(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; MALLOC(msg, struct mqueue_msg *, len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { FREE(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { FREE(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ets, ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; error = copyin(abs_timeout, &ets, sizeof(ets)); if (error != 0) goto bad; if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = ets; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct proc *p; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; PROC_LOCK(p); if (!KSI_ONQ(&nt->nt_ksi)) psignal_event(p, &nt->nt_sigev, &nt->nt_ksi); PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ets, ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); error = copyin(abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = ets; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } /* * Syscall to open a message queue. */ int kmq_open(struct thread *td, struct kmq_open_args *uap) { char path[MQFS_NAMELEN + 1]; struct mq_attr attr, *pattr; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, flags, cmode; if ((uap->flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); fdp = td->td_proc->p_fd; flags = FFLAGS(uap->flags); cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) && (uap->attr != NULL)) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg) return (EINVAL); if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize) return (EINVAL); pattr = &attr; } else pattr = NULL; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); error = falloc(td, &fp, &fd); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(pattr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { int acc_mode = 0; if (flags & FREAD) acc_mode |= VREAD; if (flags & FWRITE) acc_mode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, acc_mode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(fdp, fp, fd, td); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[fd] == fp) fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to unlink a message queue. */ int kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_write, fpp, ppn, pmq); } int kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mqueue *mq; struct file *fp; struct mq_attr attr, oattr; u_int oflag, flag; int error; if (uap->attr) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); if (attr.mq_flags & ~O_NONBLOCK) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); oattr.mq_maxmsg = mq->mq_maxmsg; oattr.mq_msgsize = mq->mq_msgsize; oattr.mq_curmsgs = mq->mq_curmsgs; if (uap->attr) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr.mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr.mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); if (uap->oattr) error = copyout(&oattr, uap->oattr, sizeof(oattr)); return (error); } int kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; int error; int waitok; error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, uap->abs_timeout); fdrop(fp, td); return (error); } int kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; int error, waitok; error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, uap->abs_timeout); fdrop(fp, td); return (error); } int kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev; struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp; struct mqueue_notifier *nt, *newnt = NULL; int error; p = td->td_proc; fdp = td->td_proc->p_fd; if (uap->sigev) { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error) return (error); if (ev.sigev_notify != SIGEV_SIGNAL && ev.sigev_notify != SIGEV_THREAD_ID && ev.sigev_notify != SIGEV_NONE) return (EINVAL); if ((ev.sigev_notify == SIGEV_SIGNAL || ev.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(ev.sigev_signo)) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); if (fget_locked(fdp, uap->mqd) != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (uap->sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, uap->mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = uap->mqd; notifier_insert(p, nt); } nt->nt_sigev = ev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, uap->mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int mqf_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); st->st_atimespec = pn->mn_atime; st->st_mtimespec = pn->mn_mtime; st->st_ctimespec = pn->mn_ctime; st->st_birthtimespec = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; return (0); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static struct fileops mqueueops = { .fo_read = mqf_read, .fo_write = mqf_write, .fo_truncate = mqf_truncate, .fo_ioctl = mqf_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; SYSCALL_MODULE_HELPER(kmq_open); SYSCALL_MODULE_HELPER(kmq_setattr); SYSCALL_MODULE_HELPER(kmq_timedsend); SYSCALL_MODULE_HELPER(kmq_timedreceive); SYSCALL_MODULE_HELPER(kmq_notify); SYSCALL_MODULE_HELPER(kmq_unlink); VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 175201) +++ head/sys/kern/uipc_syscalls.c (revision 175202) @@ -1,2629 +1,2629 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif /* SCTP */ static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, struct accept_args *uap, int compat); static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); /* * NSFBUFS-related variables and associated sysctls */ int nsfbufs; int nsfbufspeak; int nsfbufsused; SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, "Number of sendfile(2) sf_bufs at peak usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, "Number of sendfile(2) sf_bufs in use"); /* * Convert a user file descriptor to a kernel file entry. A reference on the * file entry is held upon returning. This is lighter weight than * fgetsock(), which bumps the socket reference drops the file reference * count instead, as this approach avoids several additional mutex operations * associated with the additional reference count. If requested, return the * open file flags. */ static int getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp) { struct file *fp; int error; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_SLOCK(fdp); fp = fget_locked(fdp, fd); if (fp == NULL) error = EBADF; else if (fp->f_type != DTYPE_SOCKET) { fp = NULL; error = ENOTSOCK; } else { fhold(fp); if (fflagp != NULL) *fflagp = fp->f_flag; error = 0; } FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (error); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int socket(td, uap) struct thread *td; struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, uap->type, uap->protocol, td->td_ucred, td); if (error) { fdclose(fdp, fp, fd, td); } else { finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int bind(td, uap) struct thread *td; struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) return (error); error = kern_bind(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_bind(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_bind(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto done; #endif error = sobind(so, sa, td); #ifdef MAC done: #endif fdrop(fp, td); return (error); } /* ARGSUSED */ int listen(td, uap) struct thread *td; struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; int error; error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_listen(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto done; #endif error = solisten(so, uap->backlog, td); #ifdef MAC done: #endif fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, uap, compat) struct thread *td; struct accept_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; } */ *uap; int compat; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uap->name == NULL) return (kern_accept(td, uap->s, NULL, NULL, NULL)); error = copyin(uap->anamelen, &namelen, sizeof (namelen)); if (error) return (error); error = kern_accept(td, uap->s, &name, &namelen, &fp); /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (error) { (void) copyout(&namelen, uap->anamelen, sizeof(*uap->anamelen)); return (error); } if (error == 0 && name != NULL) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uap->name, namelen); } if (error == 0) error = copyout(&namelen, uap->anamelen, sizeof(namelen)); if (error) fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { struct filedesc *fdp; struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; int error; struct socket *head, *so; int fd; u_int fflag; pid_t pgid; int tmp; if (name) { *name = NULL; if (*namelen < 0) return (EINVAL); } fdp = td->td_proc->p_fd; error = getsock(fdp, s, &headfp, &fflag); if (error) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC SOCK_LOCK(head); error = mac_socket_check_accept(td->td_ucred, head); SOCK_UNLOCK(head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd); if (error) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; error = soaccept(so, &sa); if (error) { /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (name) *namelen = 0; goto noconnection; } if (sa == NULL) { if (name) *namelen = 0; goto done; } if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; *name = sa; sa = NULL; } noconnection: if (sa) FREE(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int connect(td, uap) struct thread *td; struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error) return (error); error = kern_connect(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_connect(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; int interrupted = 0; error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_connect(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto bad; #endif error = soconnect(so, sa, td); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int socketpair(td, uap) struct thread *td; struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif error = socreate(uap->domain, &so1, uap->type, uap->protocol, td->td_ucred, td); if (error) return (error); error = socreate(uap->domain, &so2, uap->type, uap->protocol, td->td_ucred, td); if (error) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd); if (error) goto free2; sv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd); if (error) goto free3; fp2->f_data = so2; /* so2 already has ref count */ sv[1] = fd; error = soconnect2(so1, so2); if (error) goto free4; if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error) goto free4; } finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops); so1 = so2 = NULL; error = copyout(sv, uap->rsv, 2 * sizeof (int)); if (error) goto free4; fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(fdp, fp2, sv[1], td); fdrop(fp2, td); free3: fdclose(fdp, fp1, sv[0], td); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } static int sendit(td, s, mp, flags) struct thread *td; int s; struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_TRYWAIT); if (control == 0) { error = ENOBUFS; goto bad; } else { cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: if (to) FREE(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; int i; int len, error; #ifdef KTRACE struct uio *ktruio = NULL; #endif error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) return (error); so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sendto(td, uap) struct thread *td; struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; error = sendit(td, uap->s, &msg, uap->flags); return (error); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; error = sendit(td, uap->s, &msg, uap->flags); return (error); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; int i; socklen_t len; int error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif if(controlp != NULL) *controlp = 0; error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) return (error); so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? &control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); if (fromsa) FREE(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error) return (error); if (namelenp) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int recvfrom(td, uap) struct thread *td; struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return(error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, NULL); return (error); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int shutdown(td, uap) struct thread *td; struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; int error; error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); fdrop(fp, td); } return (error); } /* ARGSUSED */ int setsockopt(td, uap) struct thread *td; struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } /* ARGSUSED */ int getsockopt(td, uap) struct thread *td; struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: fdrop(fp, td); if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { struct sockaddr *sa; struct mbuf *m; int error; if ((u_int)buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if ((u_int)buflen > MCLBYTES) return (EINVAL); } m = m_get(M_TRYWAIT, type); if (m == NULL) return (ENOBUFS); if ((u_int)buflen > MLEN) { MCLGET(m, M_TRYWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); } } m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error) { FREE(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Detach mapped page and release resources back to the system. */ void sf_buf_mext(void *addr, void *args) { vm_page_t m; m = sf_buf_page(args); sf_buf_free(args); vm_page_lock_queues(); vm_page_unwire(m, 0); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ if (m->wire_count == 0 && m->object == NULL) vm_page_free(m); vm_page_unlock_queues(); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sendfile(struct thread *td, struct sendfile_args *uap) { return (do_sendfile(td, uap, 0)); } static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; int error; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error) goto out; } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error) goto out; } } error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat); out: if (hdr_uio) free(hdr_uio, M_IOV); if (trl_uio) free(trl_uio, M_IOV); return (error); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (do_sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ int kern_sendfile(struct thread *td, struct sendfile_args *uap, struct uio *hdr_uio, struct uio *trl_uio, int compat) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj = NULL; struct socket *so = NULL; struct mbuf *m = NULL; struct sf_buf *sf; struct vm_page *pg; off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0; int error, hdrlen = 0, mnw = 0; int vfslocked; /* * The file descriptor must be a regular file and have a * backing VM object. * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) goto out; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); obj = vp->v_object; if (obj != NULL) { /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ VM_OBJECT_LOCK(obj); if ((obj->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(obj); VM_OBJECT_UNLOCK(obj); } else { VM_OBJECT_UNLOCK(obj); obj = NULL; } } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (obj == NULL) { error = EINVAL; goto out; } if (uap->offset < 0) { error = EINVAL; goto out; } /* * The socket must be a stream socket and connected. * Remember if it a blocking or non-blocking socket. */ if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0) goto out; so = sock_fp->f_data; if (so->so_type != SOCK_STREAM) { error = EINVAL; goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto out; } /* * Do not wait on memory allocations but return ENOMEM for * caller to retry later. * XXX: Experimental. */ if (uap->flags & SF_MNOWAIT) mnw = 1; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto out; #endif /* If headers are specified copy them into mbufs. */ if (hdr_uio != NULL) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; if (hdr_uio->uio_resid > 0) { /* * In FBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 0, 0, 0); if (m == NULL) { error = mnw ? EAGAIN : ENOBUFS; goto out; } hdrlen = m_length(m, NULL); } } /* Protect against multiple writers to the socket. */ (void) sblock(&so->so_snd, M_WAITOK); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = uap->offset, rem = uap->nbytes; ; ) { int loopbytes = 0; int space = 0; int done = 0; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * Reduce space in the socket buffer by the size of * the header mbuf chain. * hdrlen is set to 0 after the first loop. */ space -= hdrlen; /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ while(space > loopbytes) { vm_pindex_t pindex; vm_offset_t pgoff; struct mbuf *m0; VM_OBJECT_LOCK(obj); /* * Calculate the amount to transfer. * Not to exceed a page, the EOF, * or the passed in nbytes. */ pgoff = (vm_offset_t)(off & PAGE_MASK); xfsize = omin(PAGE_SIZE - pgoff, obj->un_pager.vnp.vnp_size - uap->offset - fsbytes - loopbytes); if (uap->nbytes) rem = (uap->nbytes - fsbytes - loopbytes); else rem = obj->un_pager.vnp.vnp_size - uap->offset - fsbytes - loopbytes; xfsize = omin(rem, xfsize); if (xfsize <= 0) { VM_OBJECT_UNLOCK(obj); done = 1; /* all data sent */ break; } /* * Don't overflow the send buffer. * Stop here and send out what we've * already got. */ if (space < loopbytes + xfsize) { VM_OBJECT_UNLOCK(obj); break; } /* * Attempt to look up the page. Allocate * if not found or wait and loop if busy. */ pindex = OFF_TO_IDX(off); pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY); /* * Check if page is valid for what we need, * otherwise initiate I/O. * If we already turned some pages into mbufs, * send them off before we come here again and * block. */ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) VM_OBJECT_UNLOCK(obj); else if (m != NULL) error = EAGAIN; /* send what we already got */ else if (uap->flags & SF_NODISKIO) error = EBUSY; else { int bsize, resid; /* * Ensure that our page is still around * when the I/O completes. */ vm_page_io_start(pg); VM_OBJECT_UNLOCK(obj); /* * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); /* * XXXMAC: Because we don't have fp->f_cred * here, we pass in NOCRED. This is probably * wrong, but is consistent with our original * implementation. */ error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(obj); vm_page_io_finish(pg); if (!error) VM_OBJECT_UNLOCK(obj); mbstat.sf_iocnt++; } if (error) { vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * See if anyone else might know about * this page. If not and it is not valid, * then free it. */ if (pg->wire_count == 0 && pg->valid == 0 && pg->busy == 0 && !(pg->oflags & VPO_BUSY) && pg->hold_count == 0) { vm_page_free(pg); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (error == EAGAIN) error = 0; /* not a real error */ break; } /* * Get a sendfile buf. We usually wait as long * as necessary, but this wait can be interrupted. */ if ((sf = sf_buf_alloc(pg, (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) { mbstat.sf_allocfail++; vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * XXX: Not same check as above!? */ if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); vm_page_unlock_queues(); error = (mnw ? EAGAIN : EINTR); break; } /* * Get an mbuf and set it up as having * external storage. */ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); if (m0 == NULL) { error = (mnw ? EAGAIN : ENOBUFS); sf_buf_mext((void *)sf_buf_kva(sf), sf); break; } MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY, EXT_SFBUF); m0->m_data = (char *)sf_buf_kva(sf) + pgoff; m0->m_len = xfsize; /* Append to mbuf chain. */ if (m != NULL) m_cat(m, m0); else m = m0; /* Keep track of bits processed. */ loopbytes += xfsize; off += xfsize; } /* Add the buffer chain to the socket buffer. */ if (m != NULL) { int mlen, err; mlen = m_length(m, NULL); SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } SOCKBUF_UNLOCK(&so->so_snd); /* Avoid error aliasing. */ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); if (err == 0) { /* * We need two counters to get the * file offset and nbytes to send * right: * - sbytes contains the total amount * of bytes sent, including headers. * - fsbytes contains the total amount * of bytes sent from the file. */ sbytes += mlen; fsbytes += mlen; if (hdrlen) { fsbytes -= hdrlen; hdrlen = 0; } } else if (error == 0) error = err; m = NULL; /* pru_send always consumes */ } /* Quit outer loop on error or when we're done. */ if (error || done) goto done; } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { error = kern_writev(td, uap->s, trl_uio); if (error) goto done; sbytes += td->td_retval[0]; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } if (obj != NULL) vm_object_deallocate(obj); if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (error == ERESTART) error = EINTR; return (error); } /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #ifdef SCTP struct filedesc *fdp; struct file *nfp = NULL; int error; struct socket *head, *so; int fd; u_int fflag; fdp = td->td_proc->p_fd; error = fgetsock(td, uap->sd, &head, &fflag); if (error) goto done2; error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error) goto done2; /* * At this point we know we do have a assoc to pull * we proceed to get the fd setup. This may block * but that is ok. */ error = falloc(td, &nfp, &fd); if (error) goto done; td->td_retval[0] = fd; so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) goto noconnection; /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. */ done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; int use_rcvinfo = 1; int error = 0, len; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad; iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; len = auio.uio_resid = uap->mlen; error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: if (fp) fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; int use_rcvinfo = 1; int error=0, len, i; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad1; error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) goto sctp_bad1; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } len = auio.uio_resid; error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: if (fp) fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #ifdef SCTP u_int8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp = NULL; struct sockaddr *fromsa; int fromlen; int len, i, msg_flags; int error = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) { return (error); } error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) { goto out1; } so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { goto out; return (error); } #endif /* MAC */ if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error) { goto out; } } else { fromlen = 0; } if(uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error) { goto out; } } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == 0) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (unsigned)len); if (error) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error) { goto out; } } if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error) { goto out; } } out: free(iov, M_IOV); out1: if (fp) fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision 175201) +++ head/sys/kern/vfs_acl.c (revision 175202) @@ -1,431 +1,431 @@ /*- * Copyright (c) 1999-2006 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uma_zone_t acl_zone; static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; struct mount *mp; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setacl(td->td_ucred, vp, type, &inkernacl); if (error != 0) goto out; #endif error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); if (error) goto out; #endif error = VOP_SETACL(vp, type, 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, get an ACL for it; don't follow links. */ int __acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, set an ACL for it. */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, set an ACL for it; don't follow links. */ int __acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file descriptor, get an ACL for it. */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file descriptor, set an ACL for it. */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it; don't follow links. */ int __acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file path, check an ACL for it. */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, check an ACL for it; don't follow links. */ int __acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file descriptor, check an ACL for it. */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* ARGUSED */ static void aclinit(void *dummy __unused) { acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL) Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 175201) +++ head/sys/kern/vfs_aio.c (revision 175202) @@ -1,2323 +1,2323 @@ /*- * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vfs_aio.h" /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #define JOBST_NULL 0 #define JOBST_JOBQSOCK 1 #define JOBST_JOBQGLOBAL 2 #define JOBST_JOBRUNNING 3 #define JOBST_JOBFINISHED 4 #define JOBST_JOBQBUF 5 #define JOBST_JOBQSYNC 6 #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef MAX_AIO_PROCS #define MAX_AIO_PROCS 32 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef TARGET_AIO_PROCS #define TARGET_AIO_PROCS 4 #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif #ifndef AIOD_TIMEOUT_DEFAULT #define AIOD_TIMEOUT_DEFAULT (10 * hz) #endif #ifndef AIOD_LIFETIME_DEFAULT #define AIOD_LIFETIME_DEFAULT (30 * hz) #endif static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel threads to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel threads for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel threads for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); /* Number of async I/O thread in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_timeout; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, "Timeout value for synchronous aio operations"); static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int unloadable = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of aio (not recommended)"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; /* * Below is a key of locks used to protect each member of struct aiocblist * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * Current, there is only two backends: BIO and generic file I/O. * socket I/O is served by generic file I/O, this is not a good idea, since * disk file I/O and any other types without O_NONBLOCK flag can block daemon * threads, if there is no thread to serve socket I/O, the socket I/O will be * delayed too long or starved, we should create some threads dedicated to * sockets to do non-blocking I/O, same for pipe and fifo, for these I/O * systems we really need non-blocking interface, fiddling O_NONBLOCK in file * structure is not safe because there is race between userland and aio * daemons. */ struct aiocblist { TAILQ_ENTRY(aiocblist) list; /* (b) internal list of for backend */ TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */ TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */ int jobflags; /* (a) job flags */ int jobstate; /* (b) job state */ int inputcharge; /* (*) input blockes */ int outputcharge; /* (*) output blockes */ struct buf *bp; /* (*) private to BIO backend, * buffer pointer */ struct proc *userproc; /* (*) user process */ struct ucred *cred; /* (*) active credential when created */ struct file *fd_file; /* (*) pointer to file structure */ struct aioliojob *lio; /* (*) optional lio job */ struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */ struct knlist klist; /* (a) list of knotes */ struct aiocb uaiocb; /* (*) kernel I/O control block */ ksiginfo_t ksi; /* (a) realtime signal info */ struct task biotask; /* (*) private to BIO backend */ uint64_t seqno; /* (*) job number */ int pending; /* (a) number of pending I/O, aio_fsync only */ }; /* jobflags */ #define AIOCBLIST_DONE 0x01 #define AIOCBLIST_BUFDONE 0x02 #define AIOCBLIST_RUNDOWN 0x04 #define AIOCBLIST_CHECKSYNC 0x08 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aiothreadlist { int aiothreadflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */ struct thread *aiothread; /* (*) the AIO thread */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio flags */ int lioj_finished_count; /* (a) listio flags */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_maxactive_count; /* (*) maximum number of AIOs */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_qallowed_count; /* (*) maxiumu size of AIO queue */ int kaio_count; /* (a) size of AIO queue */ int kaio_ballowed_count; /* (*) maximum number of buffers */ int kaio_buffer_count; /* (a) number of physio buffers */ TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */ TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets, * NOT USED YET. */ TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */ struct task kaio_task; /* (*) task to kick aio threads */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static struct mtx aio_sock_mtx; static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static void aio_onceonly(void); static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lio, int type, int osigev); static void aio_physwakeup(struct buf *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void biohelper(void *, int); static void aio_daemon(void *param); static void aio_swake_cb(struct socket *, struct sockbuf *); static int aio_unload(void); static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type); #define DONE_BUF 1 #define DONE_QUEUE 2 static int do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io thread data * aiocb async io jobs * aiol list io job pointer - internal to aio_suspend XXX * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { 0, filt_aioattach, filt_aiodetach, filt_aio }; static struct filterops lio_filtops = { 0, filt_lioattach, filt_liodetach, filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_bio); /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_UNLOAD: error = aio_unload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; SYSCALL_MODULE_HELPER(aio_cancel); SYSCALL_MODULE_HELPER(aio_error); SYSCALL_MODULE_HELPER(aio_fsync); SYSCALL_MODULE_HELPER(aio_read); SYSCALL_MODULE_HELPER(aio_return); SYSCALL_MODULE_HELPER(aio_suspend); SYSCALL_MODULE_HELPER(aio_waitcomplete); SYSCALL_MODULE_HELPER(aio_write); SYSCALL_MODULE_HELPER(lio_listio); SYSCALL_MODULE_HELPER(oaio_read); SYSCALL_MODULE_HELPER(oaio_write); SYSCALL_MODULE_HELPER(olio_listio); DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static void aio_onceonly(void) { /* XXX: should probably just use so->callback */ aio_swake = &aio_swake_cb; exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; async_io_version = _POSIX_VERSION; p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); } /* * Callback for unload of AIO when used as a module. */ static int aio_unload(void) { int error; /* * XXX: no unloads by default, it's too dangerous. * perhaps we could do it if locked out callers and then * did an aio_proc_rundown() on each process. * * jhb: aio_proc_rundown() needs to run on curproc though, * so I don't think that would fly. */ if (!unloadable) return (EOPNOTSUPP); error = kqueue_del_filteropts(EVFILT_AIO); if (error) return error; error = kqueue_del_filteropts(EVFILT_LIO); if (error) return error; async_io_version = 0; aio_swake = NULL; taskqueue_free(taskqueue_aiod_bio); delete_unrhdr(aiod_unr); uma_zdestroy(kaio_zone); uma_zdestroy(aiop_zone); uma_zdestroy(aiocb_zone); uma_zdestroy(aiol_zone); uma_zdestroy(aiolio_zone); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); EVENTHANDLER_DEREGISTER(process_exec, exec_tag); mtx_destroy(&aio_job_mtx); mtx_destroy(&aio_sock_mtx); sema_destroy(&aio_newproc_sem); p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF); ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_bufqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_sockqueue); TAILQ_INIT(&ki->kaio_syncqueue); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < target_aio_procs) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { int ret = 0; PROC_LOCK(p); if (!KSI_ONQ(ksi)) { ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; ret = psignal_event(p, sigev, ksi); } PROC_UNLOCK(p); return (ret); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct aiocblist *aiocbe) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = aiocbe->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(aiocbe->jobstate == JOBST_JOBFINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist); TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); lj = aiocbe->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* aiocbe is going away, we need to destroy any knotes */ knlist_delete(&aiocbe->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&aiocbe->ksi); PROC_UNLOCK(p); MPASS(aiocbe->bp == NULL); aiocbe->jobstate = JOBST_NULL; AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * an aiocblist from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ fdrop(aiocbe->fd_file, curthread); crfree(aiocbe->cred); uma_zfree(aiocb_zone, aiocbe); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct aiocblist *cbe, *cbn; struct file *fp; struct socket *so; int remove; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { remove = 0; mtx_lock(&aio_job_mtx); if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSOCK) { fp = cbe->fd_file; MPASS(fp->f_type == DTYPE_SOCKET); so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSYNC) { TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); remove = 1; } mtx_unlock(&aio_job_mtx); if (remove) { cbe->jobstate = JOBST_JOBFINISHED; cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); aio_bio_done_notify(p, cbe, DONE_QUEUE); } } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_bufqueue) || TAILQ_FIRST(&ki->kaio_jobqueue)) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(cbe); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct aiocblist * aio_selectjob(struct aiothreadlist *aiop) { struct aiocblist *aiocbe; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_FOREACH(aiocbe, &aio_jobs, list) { userp = aiocbe->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, aiocbe, list); /* Account for currently active jobs. */ ki->kaio_active_count++; aiocbe->jobstate = JOBST_JOBRUNNING; break; } } return (aiocbe); } /* * Move all data to a permanent storage device, this code * simulates fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * The AIO processing activity. This is the code that does the I/O request for * the non-physio version of the operations. The normal vn operations are used, * and this code should work in all instances for every type of file, including * pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. */ static void aio_process(struct aiocblist *aiocbe) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct socket *so; struct uio auio; struct iovec aiov; int cnt; int error; int oublock_st, oublock_end; int inblock_st, inblock_end; td = curthread; td_savedcred = td->td_ucred; td->td_ucred = aiocbe->cred; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; if (cb->aio_lio_opcode == LIO_SYNC) { error = 0; cnt = 0; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); cb->_aiocb_private.error = error; cb->_aiocb_private.status = 0; td->td_ucred = td_savedcred; return; } aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; aiocbe->inputcharge = inblock_end - inblock_st; aiocbe->outputcharge = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { int sigpipe = 1; if (fp->f_type == DTYPE_SOCKET) { so = fp->f_data; if (so->so_options & SO_NOSIGPIPE) sigpipe = 0; } if (sigpipe) { PROC_LOCK(aiocbe->userproc); psignal(aiocbe->userproc, SIGPIPE); PROC_UNLOCK(aiocbe->userproc); } } } cnt -= auio.uio_resid; cb->_aiocb_private.error = error; cb->_aiocb_private.status = cnt; td->td_ucred = td_savedcred; } static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type) { struct aioliojob *lj; struct kaioinfo *ki; struct aiocblist *scb, *scbn; int lj_done; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = aiocbe->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } if (type == DONE_QUEUE) { aiocbe->jobflags |= AIOCBLIST_DONE; } else { aiocbe->jobflags |= AIOCBLIST_BUFDONE; } TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist); aiocbe->jobstate = JOBST_JOBFINISHED; if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi); KNOTE_LOCKED(&aiocbe->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) { TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) { if (aiocbe->fd_file == scb->fd_file && aiocbe->seqno < scb->seqno) { if (--scb->pending == 0) { mtx_lock(&aio_job_mtx); scb->jobstate = JOBST_JOBQGLOBAL; TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list); TAILQ_INSERT_TAIL(&aio_jobs, scb, list); aio_kick_nowait(userp); mtx_unlock(&aio_job_mtx); } } } } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } /* * The AIO daemon, most of the actual work is done in aio_process, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct aiocblist *aiocbe; struct aiothreadlist *aiop; struct kaioinfo *ki; struct proc *curcp, *mycp, *userp; struct vmspace *myvm, *tmpvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Local copies of curproc (cp) and vmspace (myvm) */ mycp = td->td_proc; myvm = mycp->p_vmspace; KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aiothread = td; aiop->aiothreadflags = 0; /* The daemon resides in its own pgrp. */ setsid(td, NULL); /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * curcp is the current daemon process context. * userp is the current user process context. */ curcp = mycp; /* * Take daemon off of free queue */ if (aiop->aiothreadflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((aiocbe = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); userp = aiocbe->userproc; /* * Connect to process address space for user program. */ if (userp != curcp) { /* * Save the current address space that we are * connected to. */ tmpvm = mycp->p_vmspace; /* * Point to the new user address space, and * refer to it. */ mycp->p_vmspace = userp->p_vmspace; atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); /* * If the old address space wasn't the daemons * own address space, then we need to remove the * daemon's reference from the other process * that it was acting on behalf of. */ if (tmpvm != myvm) { vmspace_free(tmpvm); } curcp = userp; } ki = userp->p_aioinfo; /* Do the I/O function. */ aio_process(aiocbe); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; mtx_unlock(&aio_job_mtx); AIO_LOCK(ki); TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); aio_bio_done_notify(userp, aiocbe, DONE_QUEUE); AIO_UNLOCK(ki); mtx_lock(&aio_job_mtx); } /* * Disconnect from user address space. */ if (curcp != mycp) { mtx_unlock(&aio_job_mtx); /* Get the user address space to disconnect from. */ tmpvm = mycp->p_vmspace; /* Get original address space for daemon. */ mycp->p_vmspace = myvm; /* Activate the daemon's address space. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); #ifdef DIAGNOSTIC if (tmpvm == myvm) { printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); } #endif /* Remove our vmspace reference. */ vmspace_free(tmpvm); curcp = mycp; mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected, that should be * curcp == mycp. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aiothreadflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime)) { if (TAILQ_EMPTY(&aio_jobs)) { if ((aiop->aiothreadflags & AIOP_FREE) && (num_aio_procs > target_aio_procs)) { TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); #ifdef DIAGNOSTIC if (mycp->p_vmspace->vm_refcnt <= 1) { printf("AIOD: bad vm refcnt for" " exiting daemon: %d\n", mycp->p_vmspace->vm_refcnt); } #endif kproc_exit(0); } } } } mtx_unlock(&aio_job_mtx); panic("shouldn't be here\n"); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct aiocblist *aiocbe) { struct aiocb *cb; struct file *fp; struct buf *bp; struct vnode *vp; struct kaioinfo *ki; struct aioliojob *lj; int error; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; if (fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; /* * If its not a disk, we don't want to return a positive error. * It causes the aio code to not fall through to try the thread * way when you're talking to a regular file. */ if (!vn_isdisk(vp, &error)) { if (error == ENOTBLK) return (-1); else return (error); } if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); if (cb->aio_nbytes > vp->v_rdev->si_iosize_max) return (-1); if (cb->aio_nbytes > MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) return (-1); ki = p->p_aioinfo; if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) return (-1); /* Create and build a buffer header for a transfer. */ bp = (struct buf *)getpbuf(NULL); BUF_KERNPROC(bp); AIO_LOCK(ki); ki->kaio_count++; ki->kaio_buffer_count++; lj = aiocbe->lio; if (lj) lj->lioj_count++; AIO_UNLOCK(ki); /* * Get a copy of the kva from the physical buffer. */ error = 0; bp->b_bcount = cb->aio_nbytes; bp->b_bufsize = cb->aio_nbytes; bp->b_iodone = aio_physwakeup; bp->b_saveaddr = bp->b_data; bp->b_data = (void *)(uintptr_t)cb->aio_buf; bp->b_offset = cb->aio_offset; bp->b_iooffset = cb->aio_offset; bp->b_blkno = btodb(cb->aio_offset); bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; /* * Bring buffer into kernel space. */ if (vmapbuf(bp) < 0) { error = EFAULT; goto doerror; } AIO_LOCK(ki); aiocbe->bp = bp; bp->b_caller1 = (void *)aiocbe; TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); aiocbe->jobstate = JOBST_JOBQBUF; cb->_aiocb_private.status = cb->aio_nbytes; AIO_UNLOCK(ki); atomic_add_int(&num_queue_count, 1); atomic_add_int(&num_buf_aio, 1); bp->b_error = 0; TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe); /* Perform transfer. */ dev_strategy(vp->v_rdev, bp); return (0); doerror: AIO_LOCK(ki); ki->kaio_count--; ki->kaio_buffer_count--; if (lj) lj->lioj_count--; aiocbe->bp = NULL; AIO_UNLOCK(ki); relpbuf(bp, NULL); return (error); } /* * Wake up aio requests that may be serviceable now. */ static void aio_swake_cb(struct socket *so, struct sockbuf *sb) { struct aiocblist *cb, *cbn; int opcode; if (sb == &so->so_snd) opcode = LIO_WRITE; else opcode = LIO_READ; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_AIO; mtx_lock(&aio_job_mtx); TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) { if (opcode == cb->uaiocb.aio_lio_opcode) { if (cb->jobstate != JOBST_JOBQSOCK) panic("invalid queue value"); /* XXX * We don't have actual sockets backend yet, * so we simply move the requests to the generic * file I/O backend. */ TAILQ_REMOVE(&so->so_aiojobq, cb, list); TAILQ_INSERT_TAIL(&aio_jobs, cb, list); aio_kick_nowait(cb->userproc); } } mtx_unlock(&aio_job_mtx); SOCKBUF_UNLOCK(sb); } /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. */ int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, int type, int oldsigev) { struct proc *p = td->td_proc; struct file *fp; struct socket *so; struct aiocblist *aiocbe, *cb; struct kaioinfo *ki; struct kevent kev; struct sockbuf *sb; int opcode; int error; int fd, kqfd; int jid; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; suword(&job->_aiocb_private.status, -1); suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.kernelinfo, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= ki->kaio_qallowed_count) { suword(&job->_aiocb_private.error, EAGAIN); return (EAGAIN); } aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); aiocbe->inputcharge = 0; aiocbe->outputcharge = 0; knlist_init(&aiocbe->klist, AIO_MTX(ki), NULL, NULL, NULL); if (oldsigev) { bzero(&aiocbe->uaiocb, sizeof(struct aiocb)); error = copyin(job, &aiocbe->uaiocb, sizeof(struct oaiocb)); bcopy(&aiocbe->uaiocb.__spare__, &aiocbe->uaiocb.aio_sigevent, sizeof(struct osigevent)); } else { error = copyin(job, &aiocbe->uaiocb, sizeof(struct aiocb)); } if (error) { suword(&job->_aiocb_private.error, error); uma_zfree(aiocb_zone, aiocbe); return (error); } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { suword(&job->_aiocb_private.error, EINVAL); uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } ksiginfo_init(&aiocbe->ksi); /* Save userspace address of the job info. */ aiocbe->uuaiocb = job; /* Get the opcode. */ if (type != LIO_NOP) aiocbe->uaiocb.aio_lio_opcode = type; opcode = aiocbe->uaiocb.aio_lio_opcode; /* Fetch the file object for the specified file descriptor. */ fd = aiocbe->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, &fp); break; case LIO_READ: error = fget_read(td, fd, &fp); break; default: error = fget(td, fd, &fp); } if (error) { uma_zfree(aiocb_zone, aiocbe); suword(&job->_aiocb_private.error, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) { error = EINVAL; goto aqueue_fail; } aiocbe->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; aiocbe->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = suword(&job->_aiocb_private.kernelinfo, jid); if (error) { error = EINVAL; goto aqueue_fail; } aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); return (0); } if ((opcode != LIO_READ) && (opcode != LIO_WRITE) && (opcode != LIO_SYNC)) { error = EINVAL; goto aqueue_fail; } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.ident = (uintptr_t)aiocbe->uuaiocb; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.data = (intptr_t)aiocbe; kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, 1); aqueue_fail: if (error) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); suword(&job->_aiocb_private.error, error); goto done; } no_kqueue: suword(&job->_aiocb_private.error, EINPROGRESS); aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; aiocbe->userproc = p; aiocbe->cred = crhold(td->td_ucred); aiocbe->jobflags = 0; aiocbe->lio = lj; if (opcode == LIO_SYNC) goto queueit; if (fp->f_type == DTYPE_SOCKET) { /* * Alternate queueing for socket ops: Reach down into the * descriptor to get the socket data. Then check to see if the * socket is ready to be read or written (based on the requested * operation). * * If it is not ready for io, then queue the aiocbe on the * socket, and set the flags so we get a call when sbnotify() * happens. * * Note if opcode is neither LIO_WRITE nor LIO_READ we lock * and unlock the snd sockbuf for no reason. */ so = fp->f_data; sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd; SOCKBUF_LOCK(sb); if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == LIO_WRITE) && (!sowriteable(so)))) { sb->sb_flags |= SB_AIO; mtx_lock(&aio_job_mtx); TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); mtx_unlock(&aio_job_mtx); AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); aiocbe->jobstate = JOBST_JOBQSOCK; ki->kaio_count++; if (lj) lj->lioj_count++; AIO_UNLOCK(ki); SOCKBUF_UNLOCK(sb); atomic_add_int(&num_queue_count, 1); error = 0; goto done; } SOCKBUF_UNLOCK(sb); } if ((error = aio_qphysio(p, aiocbe)) == 0) goto done; #if 0 if (error > 0) { aiocbe->uaiocb._aiocb_private.error = error; suword(&job->_aiocb_private.error, error); goto done; } #endif queueit: /* No buffer for daemon I/O. */ aiocbe->bp = NULL; atomic_add_int(&num_queue_count, 1); AIO_LOCK(ki); ki->kaio_count++; if (lj) lj->lioj_count++; TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); if (opcode == LIO_SYNC) { TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) { if (cb->fd_file == aiocbe->fd_file && cb->uaiocb.aio_lio_opcode != LIO_SYNC && cb->seqno < aiocbe->seqno) { cb->jobflags |= AIOCBLIST_CHECKSYNC; aiocbe->pending++; } } TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) { if (cb->fd_file == aiocbe->fd_file && cb->uaiocb.aio_lio_opcode != LIO_SYNC && cb->seqno < aiocbe->seqno) { cb->jobflags |= AIOCBLIST_CHECKSYNC; aiocbe->pending++; } } if (aiocbe->pending != 0) { TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list); aiocbe->jobstate = JOBST_JOBQSYNC; AIO_UNLOCK(ki); goto done; } } mtx_lock(&aio_job_mtx); TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); aiocbe->jobstate = JOBST_JOBQGLOBAL; aio_kick_nowait(p); mtx_unlock(&aio_job_mtx); AIO_UNLOCK(ki); error = 0; done: return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aiothreadlist *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aiothreadlist *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ int aio_return(struct thread *td, struct aio_return_args *uap) { struct proc *p = td->td_proc; struct aiocblist *cb; struct aiocb *uaiocb; struct kaioinfo *ki; int status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); uaiocb = uap->aiocbp; AIO_LOCK(ki); TAILQ_FOREACH(cb, &ki->kaio_done, plist) { if (cb->uuaiocb == uaiocb) break; } if (cb != NULL) { MPASS(cb->jobstate == JOBST_JOBFINISHED); status = cb->uaiocb._aiocb_private.status; error = cb->uaiocb._aiocb_private.error; td->td_retval[0] = status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { td->td_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { td->td_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); AIO_UNLOCK(ki); suword(&uaiocb->_aiocb_private.error, error); suword(&uaiocb->_aiocb_private.status, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ int aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct proc *p = td->td_proc; struct timeval atv; struct timespec ts; struct aiocb *const *cbptr, *cbp; struct kaioinfo *ki; struct aiocblist *cb, *cbfirst; struct aiocb **ujoblist; int njoblist; int error; int timo; int i; if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) return (EINVAL); timo = 0; if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); njoblist = 0; ujoblist = uma_zalloc(aiol_zone, M_WAITOK); cbptr = uap->aiocbp; for (i = 0; i < uap->nent; i++) { cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); if (cbp == 0) continue; ujoblist[njoblist] = cbp; njoblist++; } if (njoblist == 0) { uma_zfree(aiol_zone, ujoblist); return (0); } AIO_LOCK(ki); for (;;) { cbfirst = NULL; error = 0; TAILQ_FOREACH(cb, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (cb->uuaiocb == ujoblist[i]) { if (cbfirst == NULL) cbfirst = cb; if (cb->jobstate == JOBST_JOBFINISHED) goto RETURN; } } } /* All tasks were finished. */ if (cbfirst == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); uma_zfree(aiol_zone, ujoblist); return (error); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. */ int aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct aiocblist *cbe, *cbn; struct file *fp; struct socket *so; int error; int remove; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ error = fget(td, uap->fd, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp, &error)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { if ((uap->fd == cbe->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == cbe->uuaiocb))) { remove = 0; mtx_lock(&aio_job_mtx); if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSOCK) { MPASS(fp->f_type == DTYPE_SOCKET); so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSYNC) { TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); remove = 1; } mtx_unlock(&aio_job_mtx); if (remove) { TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; aio_bio_done_notify(p, cbe, DONE_QUEUE); cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ int aio_error(struct thread *td, struct aio_error_args *uap) { struct proc *p = td->td_proc; struct aiocblist *cb; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(cb, &ki->kaio_all, allist) { if (cb->uuaiocb == uap->aiocbp) { if (cb->jobstate == JOBST_JOBFINISHED) td->td_retval[0] = cb->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = fuword(&uap->aiocbp->_aiocb_private.status); if (status == -1) { td->td_retval[0] = fuword(&uap->aiocbp->_aiocb_private.error); return (0); } td->td_retval[0] = EINVAL; return (0); } /* syscall - asynchronous read from a file (REALTIME) */ int oaio_read(struct thread *td, struct oaio_read_args *uap) { return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 1); } int aio_read(struct thread *td, struct aio_read_args *uap) { return aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, 0); } /* syscall - asynchronous write to a file (REALTIME) */ int oaio_write(struct thread *td, struct oaio_write_args *uap) { return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 1); } int aio_write(struct thread *td, struct aio_write_args *uap) { return aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, 0); } /* syscall - list directed I/O (REALTIME) */ int olio_listio(struct thread *td, struct olio_listio_args *uap) { return do_lio_listio(td, (struct lio_listio_args *)uap, 1); } /* syscall - list directed I/O (REALTIME) */ int lio_listio(struct thread *td, struct lio_listio_args *uap) { return do_lio_listio(td, uap, 0); } static int do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev) { struct proc *p = td->td_proc; struct aiocb *iocb, * const *cbptr; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int nent; int error; int nerror; int i; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init(&lj->klist, AIO_MTX(ki), NULL, NULL, NULL); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (uap->sig && (uap->mode == LIO_NOWAIT)) { bzero(&lj->lioj_signal, sizeof(&lj->lioj_signal)); error = copyin(uap->sig, &lj->lioj_signal, oldsigev ? sizeof(struct osigevent) : sizeof(struct sigevent)); if (error) { uma_zfree(aiolio_zone, lj); return (error); } if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uap->acb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nerror = 0; cbptr = uap->acb_list; for (i = 0; i < uap->nent; i++) { iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) { error = aio_aqueue(td, iocb, lj, LIO_NOP, oldsigev); if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (uap->mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); return (error); } /* * Called from interrupt thread for physio, we should return as fast * as possible, so we schedule a biohelper task. */ static void aio_physwakeup(struct buf *bp) { struct aiocblist *aiocbe; aiocbe = (struct aiocblist *)bp->b_caller1; taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask); } /* * Task routine to perform heavy tasks, process wakeup, and signals. */ static void biohelper(void *context, int pending) { struct aiocblist *aiocbe = context; struct buf *bp; struct proc *userp; struct kaioinfo *ki; int nblks; bp = aiocbe->bp; userp = aiocbe->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; aiocbe->uaiocb._aiocb_private.error = 0; if (bp->b_ioflags & BIO_ERROR) aiocbe->uaiocb._aiocb_private.error = bp->b_error; nblks = btodb(aiocbe->uaiocb.aio_nbytes); if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) aiocbe->outputcharge += nblks; else aiocbe->inputcharge += nblks; aiocbe->bp = NULL; TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); ki->kaio_buffer_count--; aio_bio_done_notify(userp, aiocbe, DONE_BUF); AIO_UNLOCK(ki); /* Release mapping into kernel space. */ vunmapbuf(bp); relpbuf(bp, NULL); atomic_subtract_int(&num_buf_aio, 1); } /* syscall - wait for the next completion of an aio request */ int aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct proc *p = td->td_proc; struct timeval atv; struct timespec ts; struct kaioinfo *ki; struct aiocblist *cb; struct aiocb *uuaiocb; int error, status, timo; suword(uap->aiocbp, (long)NULL); timo = 0; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; cb = NULL; AIO_LOCK(ki); while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (cb != NULL) { MPASS(cb->jobstate == JOBST_JOBFINISHED); uuaiocb = cb->uuaiocb; status = cb->uaiocb._aiocb_private.status; error = cb->uaiocb._aiocb_private.error; td->td_retval[0] = status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { td->td_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { td->td_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); AIO_UNLOCK(ki); suword(uap->aiocbp, (long)uuaiocb); suword(&uuaiocb->_aiocb_private.error, error); suword(&uuaiocb->_aiocb_private.status, status); } else AIO_UNLOCK(ki); return (error); } int aio_fsync(struct thread *td, struct aio_fsync_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; if (uap->op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); ki = p->p_aioinfo; if (ki == NULL) aio_init_aioinfo(p); return aio_aqueue(td, uap->aiocbp, NULL, LIO_SYNC, 0); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; /* * The aiocbe pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_flags &= ~EV_FLAG1; knlist_add(&aiocbe->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; if (!knlist_empty(&aiocbe->klist)) knlist_remove(&aiocbe->klist, kn, 0); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; kn->kn_data = aiocbe->uaiocb._aiocb_private.error; if (aiocbe->jobstate != JOBST_JOBFINISHED) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; if (!knlist_empty(&lj->klist)) knlist_remove(&lj->klist, kn, 0); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 175201) +++ head/sys/kern/vfs_bio.c (revision 175202) @@ -1,3953 +1,3953 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; /* * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ static struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages(struct buf *bp); static void vfs_setdirty(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int flushbufqueues(int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); int runningbufspace; SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static int bufspace; SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "KVA memory used for bufs"); static int maxbufspace; SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static int bufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static int maxbufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static int lobufspace; SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); int hibufspace; SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static int lorunningspace; SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, "Minimum preferred space used for in-progress I/O"); static int hirunningspace; SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * This lock synchronizes access to bd_request. */ static struct mtx bdlock; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx rbreqlock; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct mtx nblock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bdonelock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bpinlock; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */ #define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 5 /* empty buffer headers */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; /* Lock for the bufqueues */ static struct mtx bqlock; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } mtx_unlock(&nblock); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * runningbufwakeup() - in-progress I/O accounting. * */ void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); bp->b_runningbufspace = 0; mtx_lock(&rbreqlock); if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { atomic_add_int(&numfreebuffers, 1); mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { ++runningbufreq; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(int dirtybuflevel) { mtx_lock(&bdlock); if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline void bd_speedup(void) { bd_wakeup(1); } /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int maxbuf; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += (physmem_est - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; /* XXX Avoid integer overflows later on with maxbufspace. */ maxbuf = (INT_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) nbuf = maxbuf; } #if 0 /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } #endif /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vflags = 0; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * bfreekva() - free the kva allocation for a buffer. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf *bp) { if (bp->b_kvasize) { atomic_add_int(&buffreekvacnt, 1); atomic_subtract_int(&bufspace, bp->b_kvasize); vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Mark the buffer for removal from the appropriate free list in brelse. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp), ("bremfree: buf must be locked.")); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); bp->b_flags |= B_REMFREE; /* Fixup numfreebuffers count. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) atomic_subtract_int(&numfreebuffers, 1); } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * bqlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp), ("bremfreel: buffer %p not locked.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); mtx_assert(&bqlock, MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer * from the queue and return the stats are already done. */ if (bp->b_flags & B_REMFREE) { bp->b_flags &= ~B_REMFREE; return; } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, the buffer was free and we must decrement * numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) atomic_subtract_int(&numfreebuffers, 1); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). This is really just a special case of breadn(). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf **bpp) { return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); *bpp = bp = getblk(vp, blkno, size, 0, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); if (bp->b_pin_count > 0) bunpin_wait(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* Mark the buffer clean */ bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(BUF_REFCNT(bp) != 0, ("bdwrite: buffer is not busy")); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have a lot of dirty * buffers (midpoint between our recovery point and our stall * point). */ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp) == 1, ("bdirty: bp %p not locked",bp)); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); atomic_add_int(&numdirtybuffers, 1); bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); KASSERT(BUF_REFCNT(bp) == 1, ("bundirty: bp %p not locked",bp)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&nblock); while (numdirtybuffers >= hidirtybuffers) { bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; msleep(&needsbuffer, &nblock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&nblock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && bp->b_error == EIO && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If the error is anything * other than an I/O error (EIO), assume that retryingi * is futile. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (vm_page_count_severe()) { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (!(bp->b_vflags & BV_BKGRDINPROG)) bp->b_flags |= B_RELBUF; BO_UNLOCK(bp->b_bufobj); } else bp->b_flags |= B_RELBUF; } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; obj = bp->b_bufobj->bo_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_lock_queues(); vm_page_set_invalid(m, poffset, presid); vm_page_unlock_queues(); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { if (bp->b_bufsize != 0) allocbuf(bp, 0); if (bp->b_vp != NULL) brelvp(bp); } if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); return; } /* enqueue */ mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { bp->b_qindex = QUEUE_EMPTYKVA; } else { bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); /* remaining buffers */ } else { if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) == (B_DELWRI|B_NEEDSGIANT)) bp->b_qindex = QUEUE_DIRTY_GIANT; if (bp->b_flags & B_DELWRI) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } mtx_unlock(&bqlock); /* * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already * placed the buffer on the correct queue. We must also disassociate * the device and vnode for a B_INVAL buffer so gbincore() doesn't * find it. */ if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if (!(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse */ if (bp->b_bufsize || bp->b_kvasize) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); BUF_UNLOCK(bp); return; } mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { if (bp->b_flags & B_NEEDSGIANT) bp->b_qindex = QUEUE_DIRTY_GIANT; else bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } else { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ BO_LOCK(bp->b_bufobj); if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } else { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ BO_UNLOCK(bp->b_bufobj); mtx_unlock(&bqlock); brelse(bp); return; } } mtx_unlock(&bqlock); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(struct buf *bp) { int i; vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->oflags & VPO_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { /* * Might as well free the page if we can and it has * no valid data. We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_free(m); } else if (bp->b_flags & B_DIRECT) { vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; VI_LOCK(vp); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; VI_UNLOCK(vp); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); return nwritten; } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: atomic_add_int(&getnewbufrestarts, 1); /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ mtx_lock(&bqlock); nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { /* * If no EMPTYKVA buffers and we are either * defragging or reusing, locate a CLEAN buffer * to free or reuse. If bufspace useage is low * skip this step so we can allocate a new buffer. */ if (defrag || bufspace >= lobufspace) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * If we could not find or were not allowed to reuse a * CLEAN buffer, check to see if it is ok to use an EMPTY * buffer. We can only use an EMPTY buffer if allocating * its KVA would not otherwise run us out of buffer space. */ if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* FALLTHROUGH */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); } CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, bp->b_kvasize, bp->b_bufsize, qindex); /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); bremfreel(bp); mtx_unlock(&bqlock); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", bp, bp->b_vp, qindex)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_pin_count = 0; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } /* * Notify any waiters for the buffer lock about * identity change by freeing the buffer. */ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; char *waitmsg; if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } mtx_lock(&nblock); needsbuffer |= flags; mtx_unlock(&nblock); mtx_unlock(&bqlock); bd_speedup(); /* heeeelp */ mtx_lock(&nblock); while (needsbuffer & flags) { if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { mtx_unlock(&nblock); return (NULL); } } mtx_unlock(&nblock); } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; bfreekva(bp); vm_map_lock(buffer_map); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. */ atomic_add_int(&bufdefragcnt, 1); vm_map_unlock(buffer_map); defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; atomic_add_int(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } vm_map_unlock(buffer_map); } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } return(bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) static void buf_daemon() { /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. Wakeup any waiting processes before we * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { int flushed; flushed = flushbufqueues(QUEUE_DIRTY, 0); /* The list empty check here is slightly racy */ if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) { mtx_lock(&Giant); flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0); mtx_unlock(&Giant); } if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ flushbufqueues(QUEUE_DIRTY, 1); if (!TAILQ_EMPTY( &bufqueues[QUEUE_DIRTY_GIANT])) { mtx_lock(&Giant); flushbufqueues(QUEUE_DIRTY_GIANT, 1); mtx_unlock(&Giant); } break; } uio_yield(); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep half a second. * Otherwise we loop immediately. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(int queue, int flushdeps) { struct thread *td = curthread; struct buf sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int target; target = numdirtybuffers - lodirtybuffers; if (flushdeps && target > 2) target /= 2; flushed = 0; bp = NULL; mtx_lock(&bqlock); TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist); while (flushed != target) { bp = TAILQ_FIRST(&bufqueues[queue]); if (bp == &sentinel) break; TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist); TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_pin_count > 0) { BUF_UNLOCK(bp); continue; } BO_LOCK(bp->b_bufobj); if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); if (bp->b_flags & B_INVAL) { bremfreel(bp); mtx_unlock(&bqlock); brelse(bp); flushed++; numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } - if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { + if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { mtx_unlock(&bqlock); CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); vfs_bio_awrite(bp); vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); flushwithdeps += hasdeps; flushed++; waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist); mtx_unlock(&bqlock); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_LOCK(bo); bp = gbincore(bo, blkno); BO_UNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_LOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_UNLOCK(obj); return 1; notinmem: VM_OBJECT_UNLOCK(obj); return (0); } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((bp->b_flags & B_VMIO) == 0) return; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vfs_setdirty_locked_object(bp); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; vm_page_lock_queues(); /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); vm_page_unlock_queues(); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int error; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); bo = &vp->v_bufobj; loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (TD_IS_IDLETHREAD(curthread)) return NULL; mtx_lock(&nblock); needsbuffer |= VFS_BIO_NEED_ANY; mtx_unlock(&nblock); } BO_LOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, VI_MTX(vp), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { /* * If buffer is pinned and caller does * not want sleep waiting for it to be * unpinned, bail out * */ if (bp->b_pin_count > 0) { if (flags & GB_LOCK_NOWAIT) { bqrelse(bp); return (NULL); } else { bunpin_wait(bp); } } bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { int bsize, maxsize, vmio; off_t offset; /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_UNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; bsize = bo->bo_bsize; offset = blkno * bsize; vmio = vp->v_object != NULL; maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); bp = getnewbuf(slpflag, slptimeo, size, maxsize); if (bp == NULL) { if (slpflag || slptimeo) return NULL; goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vn_canvmio(vp) != TRUE) printf("getblk: VMIO on vnode type %d\n", vp->v_type); #endif KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); } allocbuf(bp, size); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) continue; allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { atomic_subtract_int( &bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; atomic_add_int(&bufmallocspace, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { atomic_subtract_int(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, TRUE, "biodep")) vm_page_lock_queues(); bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = bp->b_bufobj->bo_object; VM_OBJECT_LOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { /* * note: must allocate system pages * since blocking here could intefere * with paging I/O, no matter which * process we are. */ m = vm_page_alloc(obj, pi, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (m == NULL) { atomic_add_int(&vm_pageout_deficit, desiredpages - bp->b_npages); VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); } else { if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test VPO_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. */ vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_UNLOCK(obj); /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } void biodone(struct bio *bp) { void (*done)(struct bio *); mtx_lock(&bdonelock); bp->bio_flags |= BIO_DONE; done = bp->bio_done; if (done == NULL) wakeup(bp); mtx_unlock(&bdonelock); if (done != NULL) done(bp); } /* * Wait for a BIO to finish. * * XXX: resort to a timeout for now. The optimal locking (if any) for this * case is not yet clear. */ int biowait(struct bio *bp, const char *wchan) { mtx_lock(&bdonelock); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); mtx_unlock(&bdonelock); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. */ static void bufdonebio(struct bio *bip) { struct buf *bp; bp = bip->bio_caller2; bp->b_resid = bp->b_bcount - bip->bio_completed; bp->b_resid = bip->bio_resid; /* XXX: remove */ bp->b_ioflags = bip->bio_flags; bp->b_error = bip->bio_error; if (bp->b_error) bp->b_ioflags |= BIO_ERROR; bufdone(bp); g_destroy_bio(bip); } void dev_strategy(struct cdev *dev, struct buf *bp) { struct cdevsw *csw; struct bio *bip; if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) panic("b_iocmd botch"); for (;;) { bip = g_new_bio(); if (bip != NULL) break; /* Try again later */ tsleep(&bp, PRIBIO, "dev_strat", hz/10); } bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ bip->bio_data = bp->b_data; bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s)", devtoname(dev))); csw = dev_refthread(dev); if (csw == NULL) { g_destroy_bio(bip); bp->b_error = ENXIO; bp->b_ioflags = BIO_ERROR; bufdone(bp); return; } (*csw->d_strategy)(bip); dev_relthread(dev); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; boolean_t are_queues_locked; obj = bp->b_bufobj->bo_object; #if defined(VFS_BIO_DEBUG) mp_fixme("usecount and vflag accessed without locks."); if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } KASSERT(vp->v_object != NULL, ("biodone: vnode %p has no vm_object", vp)); #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); VM_OBJECT_LOCK(obj); #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } if (bp->b_iocmd == BIO_READ) { vm_page_lock_queues(); are_queues_locked = TRUE; } else are_queues_locked = FALSE; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%jd)/m->pindex(%ju) mismatch\n", (intmax_t)foff, (uintmax_t)m->pindex); } #endif /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, m); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (!vn_isdisk(vp, NULL)) printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize, (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n", (u_long)m->valid, (u_long)m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } if (are_queues_locked) vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being VPO_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_LOCK(obj); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); retry: for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ pmap_remove_all(m); if (clear_modify) vfs_page_set_valid(bp, foff, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages(struct buf *bp) { int i; vm_ooffset_t foff, noff, eoff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. * */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) == 0)) { bzero(bp->b_data, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } ea = sa = bp->b_data; for(i = 0; i < bp->b_npages; i++, sa = ea) { ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); if (bp->b_pages[i] == bogus_page) continue; j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) bzero(sa, ea - sa); } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1 << j)) == 0) bzero(sa, DEV_BSIZE); } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could intefere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!p) { atomic_add_int(&vm_pageout_deficit, (to - pg) >> PAGE_SHIFT); VM_OBJECT_UNLOCK(kernel_object); VM_WAIT; VM_OBJECT_LOCK(kernel_object); goto tryagain; } p->valid = VM_PAGE_BITS_ALL; pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf( "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_qremove(pg, 1); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. */ int vmapbuf(struct buf *bp) { caddr_t addr, kva; vm_prot_t prot; int pidx, i; struct vm_page *m; struct pmap *pmap = &curproc->p_vmspace->vm_pmap; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, pidx++) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. * * NOTE! Must use pmap_extract() because addr may be in * the userland address space, and kextract is only guarenteed * to work for the kernland address space (see: sparc64 port). */ retry: if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data, prot) < 0) { vm_page_lock_queues(); for (i = 0; i < pidx; ++i) { vm_page_unhold(bp->b_pages[i]); bp->b_pages[i] = NULL; } vm_page_unlock_queues(); return(-1); } m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); if (m == NULL) goto retry; bp->b_pages[pidx] = m; } if (pidx > btoc(MAXPHYS)) panic("vmapbuf: mapped more than MAXPHYS"); pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); kva = bp->b_saveaddr; bp->b_npages = pidx; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(struct buf *bp) { int pidx; int npages; npages = bp->b_npages; pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_lock_queues(); for (pidx = 0; pidx < npages; pidx++) vm_page_unhold(bp->b_pages[pidx]); vm_page_unlock_queues(); bp->b_data = bp->b_saveaddr; } void bdone(struct buf *bp) { mtx_lock(&bdonelock); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(&bdonelock); } void bwait(struct buf *bp, u_char pri, const char *wchan) { mtx_lock(&bdonelock); while ((bp->b_flags & B_DONE) == 0) msleep(bp, &bdonelock, pri, wchan, 0); mtx_unlock(&bdonelock); } int bufsync(struct bufobj *bo, int waitfor, struct thread *td) { return (VOP_FSYNC(bo->__bo_vnode, waitfor, td)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_LOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_LOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_MTX(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } void bpin(struct buf *bp) { mtx_lock(&bpinlock); bp->b_pin_count++; mtx_unlock(&bpinlock); } void bunpin(struct buf *bp) { mtx_lock(&bpinlock); if (--bp->b_pin_count == 0) wakeup(bp); mtx_unlock(&bpinlock); } void bunpin_wait(struct buf *bp) { mtx_lock(&bpinlock); while (bp->b_pin_count > 0) msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0); mtx_unlock(&bpinlock); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } lockmgr_printinfo(&bp->b_lock); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (lockcount(&bp->b_lock)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } #endif /* DDB */ Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 175201) +++ head/sys/kern/vfs_cache.c (revision 175202) @@ -1,842 +1,842 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name */ }; /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); static u_long numneg; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, ""); #if 0 static u_long numcachepl; /* number of cache purge for leaf entries */ SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, ""); #endif struct nchstats nchstats; /* cache effectiveness statistics */ static struct mtx cache_lock; MTX_SYSINIT(vfscache, &cache_lock, "Name Cache", MTX_DEF); #define CACHE_LOCK() mtx_lock(&cache_lock) #define CACHE_UNLOCK() mtx_unlock(&cache_lock) /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_large; #define CACHE_PATH_CUTOFF 32 #define CACHE_ZONE_SMALL (sizeof(struct namecache) + CACHE_PATH_CUTOFF) #define CACHE_ZONE_LARGE (sizeof(struct namecache) + NAME_MAX) #define cache_alloc(len) uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \ cache_zone_small : cache_zone_large, M_WAITOK) #define cache_free(ncp) do { \ if (ncp != NULL) \ uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \ cache_zone_small : cache_zone_large, (ncp)); \ } while (0) static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0, sizeof(struct namecache), ""); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE(mode, name, var) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); STATNODE(CTLFLAG_RD, numneg, &numneg); STATNODE(CTLFLAG_RD, numcache, &numcache); static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap); static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps); static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits); static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps); static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits); SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); static void cache_zap(struct namecache *ncp); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); /* * Flags in namecache.nc_flag */ #define NCF_WHITE 1 /* * Grab an atomic snapshot of the name cache hash chain lengths */ SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count; n_nchash = nchash + 1; /* nchash is max index, not count */ if (!req->oldptr) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } error = SYSCTL_OUT(req, &count, sizeof(count)); if (error) return (error); } return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; pct = (used * 100 * 100) / n_nchash; error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths"); /* * cache_zap(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap(ncp) struct namecache *ncp; { struct vnode *vp; mtx_assert(&cache_lock, MA_OWNED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); vp = NULL; LIST_REMOVE(ncp, nc_hash); LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { vp = ncp->nc_dvp; numcachehv--; } if (ncp->nc_vp) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); ncp->nc_vp->v_dd = NULL; } else { TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; } numcache--; cache_free(ncp); if (vp) vdrop(vp); } /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative cacheing), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(dvp, vpp, cnp) struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { struct namecache *ncp; struct thread *td; u_int32_t hash; int error, ltype; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } td = cnp->cn_thread; retry: CACHE_LOCK(); numcalls++; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); dothits++; goto success; } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { dotdothits++; if (dvp->v_dd == NULL || (cnp->cn_flags & MAKEENTRY) == 0) { CACHE_UNLOCK(); return (0); } *vpp = dvp->v_dd; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); goto success; } } hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { numchecks++; if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == 0) { if ((cnp->cn_flags & MAKEENTRY) == 0) { nummisszap++; } else { nummiss++; } nchstats.ncs_miss++; CACHE_UNLOCK(); return (0); } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { numposzaps++; nchstats.ncs_badhits++; cache_zap(ncp); CACHE_UNLOCK(); return (0); } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { numposhits++; nchstats.ncs_goodhits++; *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); goto success; } /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { numnegzaps++; nchstats.ncs_badhits++; cache_zap(ncp); CACHE_UNLOCK(); return (0); } numneghits++; /* * We found a "negative" match, so we shift it to the end of * the "negative" cache entries queue to satisfy LRU. Also, * check to see if the entry is a whiteout; indicate this to * the componentname, if so. */ TAILQ_REMOVE(&ncneg, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); nchstats.ncs_neghits++; if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; CACHE_UNLOCK(); return (ENOENT); success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ if (dvp == *vpp) { /* lookup on "." */ VREF(*vpp); CACHE_UNLOCK(); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE); if (ltype == VOP_ISLOCKED(*vpp, td)) return (-1); else if (ltype == LK_EXCLUSIVE) - vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td); + vn_lock(*vpp, LK_UPGRADE | LK_RETRY); return (-1); } ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp, td); VOP_UNLOCK(dvp, 0, td); } VI_LOCK(*vpp); CACHE_UNLOCK(); error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td); if (cnp->cn_flags & ISDOTDOT) - vn_lock(dvp, ltype | LK_RETRY, td); + vn_lock(dvp, ltype | LK_RETRY); if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE)) ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); if (error) { *vpp = NULL; goto retry; } return (-1); } /* * Add an entry to the cache. */ void cache_enter(dvp, vp, cnp) struct vnode *dvp; struct vnode *vp; struct componentname *cnp; { struct namecache *ncp; struct nchashhead *ncpp; u_int32_t hash; int hold; int zap; int len; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cahe_enter: Adding a doomed vnode")); if (!doingcache) return; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { return; } /* * For dotdot lookups only cache the v_dd pointer if the * directory has a link back to its parent via v_cache_dst. * Without this an unlinked directory would keep a soft * reference to its parent which could not be NULLd at * cache_purge() time. */ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { CACHE_LOCK(); if (!TAILQ_EMPTY(&dvp->v_cache_dst)) dvp->v_dd = vp; CACHE_UNLOCK(); return; } } hold = 0; zap = 0; ncp = cache_alloc(cnp->cn_namelen); CACHE_LOCK(); numcache++; if (!vp) { numneg++; ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0; } else if (vp->v_type == VDIR) { vp->v_dd = dvp; } else { vp->v_dd = NULL; } /* * Set the rest of the namecache entry elements, calculate it's * hash key and insert it into the appropriate chain within * the cache entries table. */ ncp->nc_vp = vp; ncp->nc_dvp = dvp; len = ncp->nc_nlen = cnp->cn_namelen; hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT); bcopy(cnp->cn_nameptr, ncp->nc_name, len); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); ncpp = NCHHASH(hash); LIST_INSERT_HEAD(ncpp, ncp, nc_hash); if (LIST_EMPTY(&dvp->v_cache_src)) { hold = 1; numcachehv++; } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); } else { TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); } if (numneg * ncnegfactor > numcache) { ncp = TAILQ_FIRST(&ncneg); zap = 1; } if (hold) vhold(dvp); if (zap) cache_zap(ncp); CACHE_UNLOCK(); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { TAILQ_INIT(&ncneg); cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL) /* * Invalidate all entries to a particular vnode. */ void cache_purge(vp) struct vnode *vp; { CTR1(KTR_VFS, "cache_purge(%p)", vp); CACHE_LOCK(); while (!LIST_EMPTY(&vp->v_cache_src)) cache_zap(LIST_FIRST(&vp->v_cache_src)); while (!TAILQ_EMPTY(&vp->v_cache_dst)) cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); vp->v_dd = NULL; CACHE_UNLOCK(); } /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; struct namecache *ncp, *nnp; /* Scan hash tables for applicable entries */ CACHE_LOCK(); for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) { if (ncp->nc_dvp->v_mount == mp) cache_zap(ncp); } } CACHE_UNLOCK(); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == ENOENT) return (error); return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getcwd_args { u_char *buf; u_int buflen; }; #endif /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); /* Implementation of the getcwd syscall. */ int __getcwd(td, uap) struct thread *td; struct __getcwd_args *uap; { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen)); } int kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen) { char *bp, *tmpbuf; struct filedesc *fdp; int error; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); error = vn_fullpath1(td, fdp->fd_cdir, fdp->fd_rdir, tmpbuf, &bp, buflen); FILEDESC_SUNLOCK(fdp); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ #undef STATNODE #define STATNODE(name) \ static u_int name; \ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "") static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* These count for kern___getcwd(), too. */ STATNODE(numfullpathcalls); STATNODE(numfullpathfail1); STATNODE(numfullpathfail2); STATNODE(numfullpathfail4); STATNODE(numfullpathfound); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); error = vn_fullpath1(td, vn, fdp->fd_rdir, buf, retbuf, MAXPATHLEN); FILEDESC_SUNLOCK(fdp); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * The magic behind kern___getcwd() and vn_fullpath(). */ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { char *bp; int error, i, slash_prefixed; struct namecache *ncp; bp = buf + buflen - 1; *bp = '\0'; error = 0; slash_prefixed = 0; CACHE_LOCK(); numfullpathcalls++; if (vp->v_type != VDIR) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!ncp) { numfullpathfail2++; CACHE_UNLOCK(); return (ENOENT); } for (i = ncp->nc_nlen - 1; i >= 0 && bp > buf; i--) *--bp = ncp->nc_name[i]; if (bp == buf) { numfullpathfail4++; CACHE_UNLOCK(); return (ENOMEM); } *--bp = '/'; slash_prefixed = 1; vp = ncp->nc_dvp; } while (vp != rdir && vp != rootvnode) { if (vp->v_vflag & VV_ROOT) { if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ error = EBADF; break; } vp = vp->v_mount->mnt_vnodecovered; continue; } if (vp->v_dd == NULL) { numfullpathfail1++; error = ENOTDIR; break; } ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!ncp) { numfullpathfail2++; error = ENOENT; break; } MPASS(ncp->nc_dvp == vp->v_dd); for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--) *--bp = ncp->nc_name[i]; if (bp == buf) { numfullpathfail4++; error = ENOMEM; break; } *--bp = '/'; slash_prefixed = 1; vp = ncp->nc_dvp; } if (error) { CACHE_UNLOCK(); return (error); } if (!slash_prefixed) { if (bp == buf) { numfullpathfail4++; CACHE_UNLOCK(); return (ENOMEM); } else { *--bp = '/'; } } numfullpathfound++; CACHE_UNLOCK(); *retbuf = bp; return (0); } Index: head/sys/kern/vfs_extattr.c =================================================================== --- head/sys/kern/vfs_extattr.c (revision 175201) +++ head/sys/kern/vfs_extattr.c (revision 175202) @@ -1,785 +1,785 @@ /*- * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Syscall to push extended attribute configuration information into the VFS. * Accepts a path, which it converts to a mountpoint, as well as a command * (int cmd), and attribute name and misc data. * * Currently this is used only by UFS1 extended attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args /* { const char *path; int cmd; const char *filename; int attrnamespace; const char *attrname; } */ *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp, *mp_writable; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, fnvfslocked, error; AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(value, uap->attrnamespace); /* * uap->attrname is not always defined. We check again later when we * invoke the VFS call so as to pass in NULL there if needed. */ if (uap->attrname != NULL) { error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } AUDIT_ARG(text, attrname); vfslocked = fnvfslocked = 0; /* * uap->filename is not always defined. If it is, grab a vnode lock, * which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (uap->filename != NULL) { NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF | AUDITVNODE2, UIO_USERSPACE, uap->filename, td); error = namei(&nd); if (error) return (error); fnvfslocked = NDHASGIANT(&nd); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* uap->path is always defined. */ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) { if (filename_vp != NULL) vput(filename_vp); goto out; } vfslocked = NDHASGIANT(&nd); mp = nd.ni_vp->v_mount; error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp != NULL) vput(filename_vp); goto out; } error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, uap->attrname != NULL ? attrname : NULL, td); vn_finished_write(mp_writable); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp, * so vrele it if it is defined. */ if (filename_vp != NULL) vrele(filename_vp); out: VFS_UNLOCK_GIANT(fnvfslocked); VFS_UNLOCK_GIANT(vfslocked); return (error); } /*- * Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct mount *mp; struct uio auio; struct iovec aiov; ssize_t cnt; int error; VFS_ASSERT_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; cnt = nbytes; #ifdef MAC error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace, attrname, &auio); if (error) goto done; #endif error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_set_link(td, uap) struct thread *td; struct extattr_set_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /*- * Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; VFS_ASSERT_GIANT(vp->v_mount); VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the maximum * read length. */ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace, attrname, &auio); if (error) goto done; #endif error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_get_link(td, uap) struct thread *td; struct extattr_get_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; VFS_ASSERT_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, td); #ifdef MAC done: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args /* { int fd; int attrnamespace; const char *attrname; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG(text, attrname); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace, attrname, td); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); AUDIT_ARG(text, attrname); NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } int extattr_delete_link(td, uap) struct thread *td; struct extattr_delete_link_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); AUDIT_ARG(text, attrname); NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /*- * Retrieve a list of extended attributes on a file or directory. * * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", * userspace buffer pointer "data", buffer length "nbytes", * thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; VFS_ASSERT_GIANT(vp->v_mount); VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace); if (error) goto done; #endif error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0, td); return (error); } int extattr_list_fd(td, uap) struct thread *td; struct extattr_list_fd_args /* { int fd; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct file *fp; int vfslocked, error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(value, uap->attrnamespace); error = getvnode(td->td_proc->p_fd, uap->fd, &fp); if (error) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, uap->nbytes, td); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_list_file(td, uap) struct thread*td; struct extattr_list_file_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } int extattr_list_link(td, uap) struct thread*td; struct extattr_list_link_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int vfslocked, error; AUDIT_ARG(value, uap->attrnamespace); NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } Index: head/sys/kern/vfs_lookup.c =================================================================== --- head/sys/kern/vfs_lookup.c (revision 175201) +++ head/sys/kern/vfs_lookup.c (revision 175202) @@ -1,1106 +1,1111 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include "opt_vfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC /* * Allocation zone for namei */ uma_zone_t namei_zone; /* * Placeholder vnode for mp traversal */ static struct vnode *vp_crossmp; static void nameiinit(void *dummy __unused) { int error; namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); error = getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp); if (error != 0) panic("nameiinit: getnewvnode"); vp_crossmp->v_vnlock->lk_flags &= ~LK_NOSHARE; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL) #ifdef LOOKUP_SHARED static int lookup_shared = 1; #else static int lookup_shared = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0, "Enables/Disables shared locks for path name translation"); /* * Convert a pathname into a pointer to a locked vnode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ int namei(struct nameidata *ndp) { struct filedesc *fdp; /* pointer to file descriptor state */ char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; int error, linklen; struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; int vfslocked; KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0, ("NOT MPSAFE and Giant not held")); ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, ("namei: nameiop contaminated with flags")); KASSERT((cnp->cn_flags & OPMASK) == 0, ("namei: flags contaminated with nameiops")); if (!lookup_shared) cnp->cn_flags &= ~LOCKSHARED; fdp = p->p_fd; /* * Get a buffer for the name to be translated, and copy the * name into the buffer. */ if ((cnp->cn_flags & HASBUF) == 0) cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); /* If we are auditing the kernel pathname, save the user pathname. */ if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2); /* * Don't allow empty pathnames. */ if (!error && *cnp->cn_pnbuf == '\0') error = ENOENT; if (error) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif ndp->ni_vp = NULL; return (error); } ndp->ni_loopcnt = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_NAMEI)) { KASSERT(cnp->cn_thread == curthread, ("namei not using curthread")); ktrnamei(cnp->cn_pnbuf); } #endif /* * Get starting point for the translation. */ FILEDESC_SLOCK(fdp); ndp->ni_rootdir = fdp->fd_rdir; ndp->ni_topdir = fdp->fd_jdir; dp = fdp->fd_cdir; vfslocked = VFS_LOCK_GIANT(dp->v_mount); VREF(dp); FILEDESC_SUNLOCK(fdp); for (;;) { /* * Check if root directory should replace current directory. * Done at start of translation and after symbolic link. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); VFS_UNLOCK_GIANT(vfslocked); while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } dp = ndp->ni_rootdir; vfslocked = VFS_LOCK_GIANT(dp->v_mount); VREF(dp); } if (vfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; ndp->ni_startdir = dp; error = lookup(ndp); if (error) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif return (error); } vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0; ndp->ni_cnd.cn_flags &= ~GIANTHELD; /* * Check for symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif } else cnp->cn_flags |= HASBUF; if ((cnp->cn_flags & MPSAFE) == 0) { VFS_UNLOCK_GIANT(vfslocked); } else if (vfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; return (0); } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp); if (error) break; } #endif if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = (struct thread *)0; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENOENT; break; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENAMETOOLONG; break; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; vput(ndp->ni_vp); dp = ndp->ni_dvp; } uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif vput(ndp->ni_vp); ndp->ni_vp = NULL; vrele(ndp->ni_dvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } static int compute_cn_lkflags(struct mount *mp, int lkflags) { if (mp == NULL || ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) { lkflags &= ~LK_SHARED; lkflags |= LK_EXCLUSIVE; } return lkflags; } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int lookup(struct nameidata *ndp) { char *cp; /* pointer into pathname argument */ struct vnode *dp = 0; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int trailing_slash; int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; int vfslocked; /* VFS Giant state for child */ int dvfslocked; /* VFS Giant state for parent */ int tvfslocked; int lkflags_save; /* * Setup: break out flag bits into variables. */ dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0; vfslocked = 0; ndp->ni_cnd.cn_flags &= ~GIANTHELD; wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; ndp->ni_dvp = NULL; /* * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ if (lookup_shared) cnp->cn_lkflags = LK_SHARED; else cnp->cn_lkflags = LK_EXCLUSIVE; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; - vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ cnp->cn_consume = 0; for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (cnp->cn_namelen > NAME_MAX) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ trailing_slash = 0; while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { trailing_slash = 1; *ndp->ni_next = '\0'; /* XXX for direnter() ... */ } } ndp->ni_next = cp; cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; if (*ndp->ni_next == 0) cnp->cn_flags |= ISLASTCN; else cnp->cn_flags &= ~ISLASTCN; /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG(vnode, dp, ARG_VNODE1); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG(vnode, dp, ARG_VNODE2); if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp, 0, td); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); goto success; } /* * Handle "..": four special cases. * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. * 2. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 3. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other filesystem. * 4. If the vnode is the top directory of * the jail or chroot, don't let them out. */ if (cnp->cn_flags & ISDOTDOT) { if ((cnp->cn_flags & ISLASTCN) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } for (;;) { if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode || ((dp->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT) != 0)) { ndp->ni_dvp = dp; ndp->ni_vp = dp; vfslocked = VFS_LOCK_GIANT(dp->v_mount); VREF(dp); goto nextname; } if ((dp->v_vflag & VV_ROOT) == 0) break; if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ error = EBADF; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; tvfslocked = dvfslocked; dvfslocked = VFS_LOCK_GIANT(dp->v_mount); VREF(dp); vput(tdp); VFS_UNLOCK_GIANT(tvfslocked); - vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | + LK_RETRY)); } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_lookup(td->td_ucred, dp, cnp); if (error) goto bad; } #endif ndp->ni_dvp = dp; ndp->ni_vp = NULL; ASSERT_VOP_LOCKED(dp, "lookup"); VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked)); /* * If we have a shared lock we may need to upgrade the lock for the * last operation. */ if (dp != vp_crossmp && VOP_ISLOCKED(dp, td) == LK_SHARED && (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT)) - vn_lock(dp, LK_UPGRADE|LK_RETRY, td); + vn_lock(dp, LK_UPGRADE|LK_RETRY); /* * If we're looking up the last component and we need an exclusive * lock, adjust our lkflags. */ if ((cnp->cn_flags & (ISLASTCN|LOCKSHARED|LOCKLEAF)) == (ISLASTCN|LOCKLEAF)) cnp->cn_lkflags = LK_EXCLUSIVE; #ifdef NAMEI_DIAGNOSTIC vprint("lookup in", dp); #endif lkflags_save = cnp->cn_lkflags; cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags); if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { cnp->cn_lkflags = lkflags_save; KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; tvfslocked = dvfslocked; dvfslocked = VFS_LOCK_GIANT(dp->v_mount); VREF(dp); vput(tdp); VFS_UNLOCK_GIANT(tvfslocked); - vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | + LK_RETRY)); goto unionlookup; } if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } if (*cp == '\0' && trailing_slash && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp, 0, td); /* * This is a temporary assert to make sure I know what the * behavior here was. */ KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0, ("lookup: Unhandled case.")); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } goto success; } else cnp->cn_lkflags = lkflags_save; #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif /* * Take into account any additional components consumed by * the underlying filesystem. */ if (cnp->cn_consume > 0) { cnp->cn_nameptr += cnp->cn_consume; ndp->ni_next += cnp->cn_consume; ndp->ni_pathlen -= cnp->cn_consume; cnp->cn_consume = 0; } dp = ndp->ni_vp; vfslocked = VFS_LOCK_GIANT(dp->v_mount); /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted filesystem. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0, 0, td)) continue; vput(dp); VFS_UNLOCK_GIANT(vfslocked); vfslocked = VFS_LOCK_GIANT(mp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); VFS_UNLOCK_GIANT(dvfslocked); dvfslocked = 0; vref(vp_crossmp); ndp->ni_dvp = vp_crossmp; error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td); vfs_unbusy(mp, td); - if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT, td)) + if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { dpunlocked = 1; goto bad2; } ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || trailing_slash || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (dp->v_iflag & VI_DOOMED) { /* We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = EBADF; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } /* * Symlink code always expects an unlocked dvp. */ if (ndp->ni_dvp != ndp->ni_vp) VOP_UNLOCK(ndp->ni_dvp, 0, td); goto success; } /* * Check for bogus trailing slashes. */ if (trailing_slash && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } nextname: /* * Not a symbolic link. If more pathname, * continue at next component, else return. */ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', ("lookup: invalid path state.")); if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); VFS_UNLOCK_GIANT(dvfslocked); dvfslocked = vfslocked; /* dp becomes dvp in dirloop */ vfslocked = 0; goto dirloop; } /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) { if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); VFS_UNLOCK_GIANT(dvfslocked); dvfslocked = 0; } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) VOP_UNLOCK(ndp->ni_dvp, 0, td); if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG(vnode, dp, ARG_VNODE1); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG(vnode, dp, ARG_VNODE2); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, td); success: /* * Because of lookup_shared we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if ((cnp->cn_flags & (ISLASTCN | LOCKSHARED | LOCKLEAF)) == (ISLASTCN | LOCKLEAF) && VOP_ISLOCKED(dp, td) != LK_EXCLUSIVE) { - vn_lock(dp, LK_UPGRADE | LK_RETRY, td); + vn_lock(dp, LK_UPGRADE | LK_RETRY); } if (vfslocked && dvfslocked) VFS_UNLOCK_GIANT(dvfslocked); /* Only need one */ if (vfslocked || dvfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; return (0); bad2: if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); bad: if (!dpunlocked) vput(dp); VFS_UNLOCK_GIANT(vfslocked); VFS_UNLOCK_GIANT(dvfslocked); ndp->ni_cnd.cn_flags &= ~GIANTHELD; ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-acquire things. */ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct thread *td = cnp->cn_thread; struct vnode *dp = 0; /* the directory we are searching */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; KASSERT(cnp->cn_flags & ISLASTCN, ("relookup: Not given last component.")); /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); KASSERT(wantparent, ("relookup: parent not wanted.")); rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; cnp->cn_lkflags = LK_EXCLUSIVE; - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef NAMEI_DIAGNOSTIC printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (cnp->cn_nameiop != LOOKUP || wantparent) { error = EISDIR; goto bad; } if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp, 0, td); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ #ifdef NAMEI_DIAGNOSTIC vprint("search in:", dp); #endif if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp, 0, td); /* * This is a temporary assert to make sure I know what the * behavior here was. */ KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0, ("relookup: Unhandled case.")); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (dvp == dp) vrele(dvp); else vput(dvp); error = EROFS; goto bad; } /* * Set the parent lock/ref state to the requested state. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) { if (wantparent) VOP_UNLOCK(dvp, 0, td); else vput(dvp); } else if (!wantparent) vrele(dvp); /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, td); return (0); bad: vput(dp); *vpp = NULL; return (error); } /* * Free data allocated by namei(); see namei(9) for details. */ void NDFREE(struct nameidata *ndp, const u_int flags) { int unlock_dvp; int unlock_vp; unlock_dvp = 0; unlock_vp = 0; if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) unlock_vp = 1; if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { if (unlock_vp) { vput(ndp->ni_vp); unlock_vp = 0; } else vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (unlock_vp) VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) unlock_dvp = 1; if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { if (unlock_dvp) { vput(ndp->ni_dvp); unlock_dvp = 0; } else vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (unlock_dvp) VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Determine if there is a suitable alternate filename under the specified * prefix for the specified path. If the create flag is set, then the * alternate prefix will be used so long as the parent directory exists. * This is used by the various compatiblity ABIs so that Linux binaries prefer * files under /compat/linux for example. The chosen path (whether under * the prefix or under /) is returned in a kernel malloc'd buffer pointed * to by pathbuf. The caller is responsible for free'ing the buffer from * the M_TEMP bucket if one is returned. */ int kern_alternate_path(struct thread *td, const char *prefix, char *path, enum uio_seg pathseg, char **pathbuf, int create) { struct nameidata nd, ndroot; char *ptr, *buf, *cp; size_t len, sz; int error; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pathbuf = buf; /* Copy the prefix into the new pathname as a starting point. */ len = strlcpy(buf, prefix, MAXPATHLEN); if (len >= MAXPATHLEN) { *pathbuf = NULL; free(buf, M_TEMP); return (EINVAL); } sz = MAXPATHLEN - len; ptr = buf + len; /* Append the filename to the prefix. */ if (pathseg == UIO_SYSSPACE) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { *pathbuf = NULL; free(buf, M_TEMP); return (error); } /* Only use a prefix with absolute pathnames. */ if (*ptr != '/') { error = EINVAL; goto keeporig; } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (create) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td); error = namei(&nd); *cp = '/'; if (error != 0) goto keeporig; } else { NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error != 0) goto keeporig; /* * We now compare the vnode of the prefix to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix, td); /* We shouldn't ever get an error from this namei(). */ error = namei(&ndroot); if (error == 0) { if (nd.ni_vp == ndroot.ni_vp) error = ENOENT; NDFREE(&ndroot, NDF_ONLY_PNBUF); vrele(ndroot.ni_vp); VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot)); } } NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(NDHASGIANT(&nd)); keeporig: /* If there was an error, use the original path name. */ if (error) bcopy(ptr, buf, len); return (error); } Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 175201) +++ head/sys/kern/vfs_mount.c (revision 175202) @@ -1,2309 +1,2309 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_rootdevname.h" #include "opt_ddb.h" #include "opt_mac.h" #ifdef DDB #include #endif #define ROOTNAME "root_device" #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, int fsflags, void *fsdata); static int vfs_mountroot_ask(void); static int vfs_mountroot_try(const char *mountfrom); static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions); static void free_mntarg(struct mntarg *ma); static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store */ /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", "update", NULL }; /* * The root specifiers we will try if RB_CDROM is specified. */ static char *cdrom_rootdevnames[] = { "cd9660:cd0", "cd9660:acd0", NULL }; /* legacy find-root code */ char *rootdevnames[2] = {NULL, NULL}; #ifndef ROOTDEVNAME # define ROOTDEVNAME NULL #endif static const char *ctrootdevname = ROOTDEVNAME; /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); #ifdef INVARIANTS else if (opt->len != 0) panic("%s: mount option with NULL value but length != 0", __func__); #endif free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ static int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused; unsigned int i, iovcnt; int error, namelen, optlen; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *new; TAILQ_FOREACH(opt, opts, link) { /* * Check that this option hasn't been redefined * nor cancelled with a "no" mount option. */ opt2 = TAILQ_FIRST(toopts); while (opt2 != NULL) { if (strcmp(opt2->name, opt->name) == 0) goto next; if (strncmp(opt2->name, "no", 2) == 0 && strcmp(opt2->name + 2, opt->name) == 0) { vfs_freeopt(toopts, opt2); goto next; } opt2 = TAILQ_NEXT(opt2, link); } /* We want this option, duplicate it. */ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK); strcpy(new->name, opt->name); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else { new->value = NULL; } new->len = opt->len; TAILQ_INSERT_TAIL(toopts, new, link); next: continue; } } /* * Mount a filesystem. */ int nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; struct iovec *iov; unsigned int i; int error; u_int iovcnt; AUDIT_ARG(fflags, uap->flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try(). */ uap->flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) return (EINVAL); error = copyinuio(uap->iovp, iovcnt, &auio); if (error) return (error); iov = auio->uio_iov; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > MMAXOPTIONLEN) { free(auio, M_IOV); return (EINVAL); } iov++; } error = vfs_donmount(td, uap->flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; lockdestroy(&mp->mnt_lock); mtx_destroy(&mp->mnt_mtx); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct thread *td) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; mp->mnt_ref = 0; (void) vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; /* XXX Unlocked */ mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(td->td_ucred); mp->mnt_stat.f_owner = td->td_ucred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(td->td_ucred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { int i; MNT_ILOCK(mp); for (i = 0; mp->mnt_ref && i < 3; i++) msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz); /* * This will always cause a 3 second delay in rebooting due to * refs on the root mountpoint that never go away. Most of these * are held by init which never exits. */ if (i == 3 && (!rebooting || bootverbose)) printf("Mount point %s had %d dangling refs\n", mp->mnt_stat.f_mntonname, mp->mnt_ref); if (mp->mnt_holdcnt != 0) { printf("Waiting for mount point to be unheld\n"); while (mp->mnt_holdcnt != 0) { mp->mnt_holdcntwaiters++; msleep(&mp->mnt_holdcnt, MNT_MTX(mp), PZERO, "mntdestroy", 0); mp->mnt_holdcntwaiters--; } printf("mount point unheld\n"); } if (mp->mnt_writeopcount > 0) { printf("Waiting for mount point write ops\n"); while (mp->mnt_writeopcount > 0) { mp->mnt_kern_flag |= MNTK_SUSPEND; msleep(&mp->mnt_writeopcount, MNT_MTX(mp), PZERO, "mntdestroy2", 0); } printf("mount point write ops completed\n"); } if (mp->mnt_secondary_writes > 0) { printf("Waiting for mount point secondary write ops\n"); while (mp->mnt_secondary_writes > 0) { mp->mnt_kern_flag |= MNTK_SUSPEND; msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), PZERO, "mntdestroy3", 0); } printf("mount point secondary write ops completed\n"); } MNT_IUNLOCK(mp); mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vprint("", vp); panic("unmount: dangling vnode"); } MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); mp->mnt_writeopcount = -1000; mp->mnt_nvnodelistsize = -1000; mp->mnt_secondary_writes = -1000; MNT_IUNLOCK(mp); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *noro_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; int has_rw, has_noro; errmsg = NULL; errmsg_len = 0; errmsg_pos = -1; has_rw = 0; has_noro = 0; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH(opt, optlist, link) { if (strcmp(opt->name, "update") == 0) fsflags |= MNT_UPDATE; else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) fsflags |= MNT_FORCE; else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; has_noro = 1; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; has_rw = 1; } else if (strcmp(opt->name, "ro") == 0) fsflags |= MNT_RDONLY; else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; } else if (strcmp(opt->name, "snapshot") == 0) fsflags |= MNT_SNAPSHOT; else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; } /* * If "rw" was specified as a mount option, and we * are trying to update a mount-point from "ro" to "rw", * we need a mount option "noro", since in vfs_mergeopts(), * "noro" will cancel "ro", but "rw" will not do anything. */ if (has_rw && !has_noro) { noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); noro_opt->name = strdup("noro", M_MOUNT); noro_opt->value = NULL; noro_opt->len = 0; TAILQ_INSERT_TAIL(optlist, noro_opt, link); } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { error = ENAMETOOLONG; goto bail; } mtx_lock(&Giant); error = vfs_domount(td, fstype, fspath, fsflags, optlist); mtx_unlock(&Giant); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (error != 0) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; int error; AUDIT_ARG(fflags, uap->flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try(). */ uap->flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG(text, fstype); mtx_lock(&Giant); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) { mtx_unlock(&Giant); return (ENOENT); } if (vfsp->vfc_vfsops->vfs_cmount == NULL) { mtx_unlock(&Giant); return (EOPNOTSUPP); } ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ int fsflags, /* Flags common to all filesystems. */ void *fsdata /* Options local to the filesystem. */ ) { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; struct export_args export; int error, flag = 0; struct vattr va; struct nameidata nd; mtx_assert(&Giant, MA_OWNED); /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL)) return (EPERM); } /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_vflag & VV_ROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; MNT_ILOCK(mp); flag = mp->mnt_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { MNT_IUNLOCK(mp); vput(vp); return (EOPNOTSUPP); /* Needs translation */ } MNT_IUNLOCK(mp); /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error) { vput(vp); return (error); } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); MNT_ILOCK(mp); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS); MNT_IUNLOCK(mp); VOP_UNLOCK(vp, 0, td); mp->mnt_optnew = fsdata; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); } else { /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_ucred->cr_uid) { error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); if (error) { vput(vp); return (error); } } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td); VOP_UNLOCK(vp, 0, td); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = fsdata; } /* * Set the mount level flags. */ MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) | (fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS | MNT_RDONLY)); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); /* * Process the export option only if we are * updating mount options. */ if (!error && (fsflags & MNT_UPDATE)) { if (vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof(export)) == 0) error = vfs_export(mp, &export); } if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; (void)VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if (mp->mnt_flag & MNT_UPDATE) { MNT_ILOCK(mp); if (error) mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); else mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Put the new filesystem on the mount list after root. */ cache_purge(vp); if (!error) { struct vnode *newdp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td)) panic("mount: lost mount"); mountcheckdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if (error) vrele(vp); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vfs_unbusy(mp, td); vfs_mount_destroy(mp); vput(vp); } return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { char *path; int flags; } */ *uap; { struct mount *mp; char *pathbuf; int error, id0, id1; if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1); mtx_lock(&Giant); if (uap->flags & MNT_BYFSID) { /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { mtx_unlock(&Giant); free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) break; } mtx_unlock(&mountlist_mtx); } else { mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) break; } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ mtx_unlock(&Giant); return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { mtx_unlock(&Giant); return (EINVAL); } error = dounmount(mp, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * Do the actual filesystem unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; int mnt_gen_r; mtx_assert(&Giant, MA_OWNED); if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); - vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, td); + vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); vdrop(coveredvp); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0, td); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error) { if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (error); } MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ; /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) mp->mnt_kern_flag |= MNTK_UNMOUNTF; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td); if (error) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ | MNTK_UNMOUNTF); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ; if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) { MNT_IUNLOCK(mp); (void) vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vput(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); vfs_mount_destroy(mp); return (0); } /* * --------------------------------------------------------------------- * Mounting of root filesystem * */ struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(&root_holds); static int root_mount_complete; /* * Hold root mount. */ struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&mountlist_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&mountlist_mtx); return (h); } /* * Release root mount. */ void root_mount_rel(struct root_hold_token *h) { mtx_lock(&mountlist_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&mountlist_mtx); free(h, M_DEVBUF); } /* * Wait for all subsystems to release root mount. */ static void root_mount_prepare(void) { struct root_hold_token *h; for (;;) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&mountlist_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&mountlist_mtx); break; } printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold", hz); } } /* * Root was mounted, share the good news. */ static void root_mount_done(void) { /* * Use a mutex to prevent the wakeup being missed and waiting for * an extra 1 second sleep. */ mtx_lock(&mountlist_mtx); root_mount_complete = 1; wakeup(&root_mount_complete); mtx_unlock(&mountlist_mtx); } /* * Return true if root is already mounted. */ int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } /* * Wait until root is mounted. */ void root_mount_wait(void) { /* * Panic on an obvious deadlock - the function can't be called from * a thread which is doing the whole SYSINIT stuff. */ KASSERT(curthread->td_proc->p_pid != 0, ("root_mount_wait: cannot be called from the swapper thread")); mtx_lock(&mountlist_mtx); while (!root_mount_complete) { msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait", hz); } mtx_unlock(&mountlist_mtx); } static void set_rootvnode(struct thread *td) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td)) panic("Cannot find root vnode"); p = td->td_proc; FILEDESC_SLOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_SUNLOCK(p->p_fd); VOP_UNLOCK(rootvnode, 0, td); } /* * Mount /devfs as our root filesystem, but do not put it on the mountlist * yet. Create a /dev -> / symlink so that absolute pathnames will lookup. */ static void devfs_first(void) { struct thread *td = curthread; struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp = NULL; int error; vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return; mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td); error = VFS_MOUNT(mp, td); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); set_rootvnode(td); error = kern_symlink(td, "/", "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); } /* * Surgically move our devfs to be mounted on /dev. */ static void devfs_fixup(struct thread *td) { struct nameidata nd; int error; struct vnode *vp, *dvp; struct mount *mp; /* Remove our devfs mount from the mountlist and purge the cache */ mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mp); VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td); VI_LOCK(dvp); dvp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(dvp); dvp->v_mountedhere = NULL; /* Set up the real rootvnode, and purge the cache */ TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL; set_rootvnode(td); cache_purgevfs(rootvnode->v_mount); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (error) { printf("Lookup of /dev for devfs, error: %d\n", error); return; } NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VDIR) { vput(vp); } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error) { vput(vp); } cache_purge(vp); mp->mnt_vnodecovered = vp; vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); VOP_UNLOCK(vp, 0, td); vput(dvp); vfs_unbusy(mp, td); /* Unlink the no longer needed /dev/dev -> / symlink */ kern_unlink(td, "/dev/dev", UIO_SYSSPACE); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * Find and mount the root filesystem */ void vfs_mountroot(void) { char *cp; int error, i, asked = 0; root_mount_prepare(); mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); devfs_first(); /* * We are booted with instructions to prompt for the root filesystem. */ if (boothowto & RB_ASKNAME) { if (!vfs_mountroot_ask()) goto mounted; asked = 1; } /* * The root filesystem information is compiled in, and we are * booted with instructions to use it. */ if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) { if (!vfs_mountroot_try(ctrootdevname)) goto mounted; ctrootdevname = NULL; } /* * We've been given the generic "use CDROM as root" flag. This is * necessary because one media may be used in many different * devices, so we need to search for them. */ if (boothowto & RB_CDROM) { for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { if (!vfs_mountroot_try(cdrom_rootdevnames[i])) goto mounted; } } /* * Try to use the value read by the loader from /etc/fstab, or * supplied via some other means. This is the preferred * mechanism. */ cp = getenv("vfs.root.mountfrom"); if (cp != NULL) { error = vfs_mountroot_try(cp); freeenv(cp); if (!error) goto mounted; } /* * Try values that may have been computed by code during boot */ if (!vfs_mountroot_try(rootdevnames[0])) goto mounted; if (!vfs_mountroot_try(rootdevnames[1])) goto mounted; /* * If we (still) have a compiled-in default, try it. */ if (ctrootdevname != NULL) if (!vfs_mountroot_try(ctrootdevname)) goto mounted; /* * Everything so far has failed, prompt on the console if we haven't * already tried that. */ if (!asked) if (!vfs_mountroot_ask()) goto mounted; panic("Root mount failed, startup aborted."); mounted: root_mount_done(); } /* * Mount (mountfrom) as the root filesystem. */ static int vfs_mountroot_try(const char *mountfrom) { struct mount *mp; char *vfsname, *path; time_t timebase; int error; char patt[32]; vfsname = NULL; path = NULL; mp = NULL; error = EINVAL; if (mountfrom == NULL) return (error); /* don't complain */ printf("Trying to mount root from %s\n", mountfrom); /* parse vfs name and path */ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); vfsname[0] = path[0] = 0; sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); if (sscanf(mountfrom, patt, vfsname, path) < 1) goto out; if (path[0] == '\0') strcpy(path, ROOTNAME); error = kernel_vmount( MNT_RDONLY | MNT_ROOTFS, "fstype", vfsname, "fspath", "/", "from", path, NULL); if (error == 0) { /* * We mount devfs prior to mounting the / FS, so the first * entry will typically be devfs. */ mp = TAILQ_FIRST(&mountlist); KASSERT(mp != NULL, ("%s: mountlist is empty", __func__)); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Typically devfs has no time stamp and the only other FS * is the actual / FS. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; do { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } while (mp != NULL); inittodr(timebase); devfs_fixup(curthread); } out: free(path, M_MOUNT); free(vfsname, M_MOUNT); return (error); } /* * --------------------------------------------------------------------- * Interactive root filesystem selection code. */ static int vfs_mountroot_ask(void) { char name[128]; for(;;) { printf("\nManual root filesystem specification:\n"); printf(" : Mount using filesystem \n"); #if defined(__amd64__) || defined(__i386__) || defined(__ia64__) printf(" eg. ufs:da0s1a\n"); #else printf(" eg. ufs:/dev/da0a\n"); #endif printf(" ? List valid disk boot devices\n"); printf(" Abort manual input\n"); printf("\nmountroot> "); gets(name, sizeof(name), 1); if (name[0] == '\0') return (1); if (name[0] == '?') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (!vfs_mountroot_try(name)) return (0); } } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; sprintf(errmsg, "mount option <%s> is unknown", p); printf("%s\n", errmsg); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); } } } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; int i; if (opts == NULL) return (-1); i = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) return (i); ++i; } return (-1); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } /* * This is a helper function for filesystems to traverse their * vnodes. See MNT_VNODE_FOREACH() in sys/mount.h */ struct vnode * __mnt_vnode_next(struct vnode **mvp, struct mount *mp) { struct vnode *vp; mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); vp = TAILQ_NEXT(*mvp, v_nmntvnodes); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { __mnt_vnode_markerfree(mvp, mp); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); return (vp); } struct vnode * __mnt_vnode_first(struct vnode **mvp, struct mount *mp) { struct vnode *vp; mtx_assert(MNT_MTX(mp), MA_OWNED); vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { *mvp = NULL; return (NULL); } mp->mnt_holdcnt++; MNT_IUNLOCK(mp); *mvp = (struct vnode *) malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); (*mvp)->v_type = VMARKER; vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); MNT_ILOCK(mp); *mvp = NULL; mp->mnt_holdcnt--; if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) wakeup(&mp->mnt_holdcnt); return (NULL); } mp->mnt_markercnt++; (*mvp)->v_mount = mp; TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); return (vp); } void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); MNT_ILOCK(mp); *mvp = NULL; mp->mnt_markercnt--; mp->mnt_holdcnt--; if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) wakeup(&mp->mnt_holdcnt); } int __vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, use printf. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, int flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, -1); } va_end(ap); error = kernel_mount(ma, flags); return (error); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 175201) +++ head/sys/kern/vfs_subr.c (revision 175202) @@ -1,4031 +1,4031 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vdestroy(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ int mpsafe_vfs = 1; TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed * vnode. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX 100000 #endif static void vntblinit(void *dummy __unused) { /* * Desiredvnodes is a function of the physical memory size and * the kernel's heap size. Specifically, desiredvnodes scales * in proportion to the physical memory size until two fifths * of the kernel's heap size is consumed by vnodes and vm * objects. */ desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(struct mount *mp, int flags, struct mtx *interlkp, struct thread *td) { int lkflags; MNT_ILOCK(mp); MNT_REF(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0); MNT_REL(mp); MNT_IUNLOCK(mp); if (interlkp) mtx_lock(interlkp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); lkflags = LK_SHARED | LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp, struct thread *td) { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); vfs_rel(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL)) return (EPERM); /* * If the file system was mounted outside a jail and a jailed thread * tries to access it, deny immediately. */ if (!jailed(mp->mnt_cred) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted inside different jail that the jail of * the calling thread, deny immediately. */ if (jailed(mp->mnt_cred) && jailed(td->td_ucred) && mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) { return (EPERM); } if ((mp->mnt_flag & MNT_USER) == 0 || mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static u_int16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct thread *td; struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = cnt.v_page_count * 2 / usevnodes; done = 0; td = curthread; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK, td); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0, td); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; int vfslocked; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vtryrecycle(vp); VFS_UNLOCK_GIANT(vfslocked); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; struct thread *td = curthread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); mtx_lock(&Giant); for (;;) { kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { int vfsunlocked; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (!VFS_NEEDSGIANT(mp)) { mtx_unlock(&Giant); vfsunlocked = 1; } else vfsunlocked = 0; done += vlrureclaim(mp); if (vfsunlocked) mtx_lock(&Giant); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (done == 0) { EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10); #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) /* * Routines having to do with the management of the vnode table. */ static void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR1(KTR_VFS, "vdestroy vp %p", vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note); mtx_destroy(&vp->v_pollinfo->vpi_lock); uma_zfree(vnodepoll_zone, vp->v_pollinfo); } #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct thread *td = curthread; struct mount *vnmp; CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) return (EWOULDBLOCK); /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0, td); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp = NULL; struct bufobj *bo; mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); goto alloc; } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } alloc: numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; bo->bo_mtx = &vp->v_interlock; bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); else if (mp == NULL) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp); *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { struct thread *td; td = curthread; /* XXX ? */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; /* XXX non mp-safe fs may still call insmntque with vnode unlocked */ if (!VOP_ISLOCKED(vp, td)) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && mp->mnt_nvnodelistsize == 0) { MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo) { CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length); /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: VI_LOCK(vp); bo = &vp->v_bufobj; anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } VI_LOCK(vp); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } VI_LOCK(vp); } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); VI_LOCK(vp); goto restartsync; } } bufobj_wwait(bo, 0, 0); VI_UNLOCK(vp); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. */ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); ASSERT_VI_LOCKED(vp, "bgetvp"); vholdl(vp); if (VFS_NEEDSGIANT(vp->v_mount) || vp->v_bufobj.bo_flag & BO_NEEDSGIANT) bp->b_flags |= B_NEEDSGIANT; bp->b_vp = vp; bp->b_bufobj = &vp->v_bufobj; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_flags &= ~B_NEEDSGIANT; bp->b_vp = NULL; bp->b_bufobj = NULL; vdropl(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; int vfslocked; vfslocked = 0; restart: *bo = LIST_FIRST(slp); if (*bo == NULL) { VFS_UNLOCK_GIANT(vfslocked); return (0); } vp = (*bo)->__bo_vnode; /* XXX */ if (VFS_NEEDSGIANT(vp->v_mount)) { if (!vfslocked) { vfslocked = 1; if (mtx_trylock(&Giant) == 0) { mtx_unlock(&sync_mtx); mtx_lock(&Giant); mtx_lock(&sync_mtx); goto restart; } } } else { VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; } if (VOP_ISLOCKED(vp, NULL) != 0) { VFS_UNLOCK_GIANT(vfslocked); return (1); } if (VI_TRYLOCK(vp) == 0) { VFS_UNLOCK_GIANT(vfslocked); return (1); } /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&sync_mtx); return (1); } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VI_LOCK(vp); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } vdropl(vp); VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next; struct synclist *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; static int dummychan; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING) msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl", hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { struct thread *td; int ret = 0; td = FIRST_THREAD_IN_PROC(updateproc); mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); sleepq_remove(td, &lbolt); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { struct thread *td; if (howto & RB_NOSYNC) return; td = FIRST_THREAD_IN_PROC(updateproc); mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); sleepq_remove(td, &lbolt); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ VI_LOCK(vp); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif VI_UNLOCK(vp); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new filesystem type). */ int vget(struct vnode *vp, int flags, struct thread *td) { int oweinact; int oldflags; int error; error = 0; oldflags = flags; oweinact = 0; VFS_ASSERT_GIANT(vp->v_mount); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); /* * If the inactive call was deferred because vput() was called * with a shared lock, we have to do it here before another thread * gets a reference to data that should be dead. */ if (vp->v_iflag & VI_OWEINACT) { if (flags & LK_NOWAIT) { VI_UNLOCK(vp); return (EBUSY); } flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; oweinact = 1; } vholdl(vp); - if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { + if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); return (error); } VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); if (oweinact) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VI_UNLOCK(vp); if ((oldflags & LK_TYPE_MASK) == 0) VOP_UNLOCK(vp, 0, td); } else VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; - if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) { + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); } else { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; } vdropl(vp); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0, td); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT, td); VI_LOCK(vp); if (error) { if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; goto done; } } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. */ void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vdropl"); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_LOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush( struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); - error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); + error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); vgone(rootvp); VOP_UNLOCK(rootvp, 0, td); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_LOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_LOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) vinvalbuf(vp, 0, td, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_OBJDIRTY) strlcat(buf, "|VI_OBJDIRTY", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp, NULL)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct thread *td = req->td; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp, td); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. */ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); if (vp->v_pollinfo != NULL) { uma_zfree(vnodepoll_zone, vi); return; } vp->v_pollinfo = vi; mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { if (vp->v_pollinfo == NULL) v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return events; } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return 0; } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque failed"); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } VI_LOCK(vp); vn_syncer_add_to_worklist(&vp->v_bufobj, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); VI_UNLOCK(vp); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct thread *td = ap->a_td; int error; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, td); return (0); } MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); vfs_unbusy(mp, td); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; VI_LOCK(vp); bo = &vp->v_bufobj; vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } VI_UNLOCK(vp); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. * * The ifdef'd CAPABILITIES version is here for reference, but is not * actually used. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, mode_t acc_mode, struct ucred *cred, int *privused) { mode_t dac_granted; mode_t priv_granted; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, int access) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, access, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (BUF_REFCNT(bp) < 1) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0, curthread); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; if (vp->v_pollinfo == NULL) v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct thread *td) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td); } } Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 175201) +++ head/sys/kern/vfs_syscalls.c (revision 175202) @@ -1,4264 +1,4264 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int chroot_refuse_vdir_fds(struct filedesc *fdp); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); static int setfmode(struct thread *td, struct vnode *, int); static int setfflags(struct thread *td, struct vnode *, int); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); /* * The module initialization routine for POSIX asynchronous I/O will * set this to the version of AIO that it implements. (Zero means * that it is not implemented.) This value is used here by pathconf() * and in kern_descrip.c by fpathconf(). */ int async_io_version; #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int vfslocked; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); } VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { char *path; int cmd; int uid; caddr_t arg; } */ *uap; { struct mount *mp; int vfslocked; int error; struct nameidata nd; AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(uid, uap->uid); if (jailed(td->td_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; if ((error = vfs_busy(mp, 0, NULL, td))) { vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } vrele(nd.ni_vp); error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td); vfs_unbusy(mp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int statfs(td, uap) struct thread *td; register struct statfs_args /* { char *path; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct statfs *sp, sb; int vfslocked; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); mp = nd.ni_vp->v_mount; vfs_ref(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("statfs(%d): %s: %d\n", vfslocked, path, error); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { int fd; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct statfs *sp, sb; int vfslocked; struct vnode *vp; int error; AUDIT_ARG(fd, fd); error = getvnode(td->td_proc->p_fd, fd, &fp); if (error) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef AUDIT AUDIT_ARG(vnode, vp, ARG_VNODE1); #endif mp = vp->v_mount; if (mp) vfs_ref(mp); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); if (vp->v_iflag & VI_DOOMED) { error = EBADF; goto out; } #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: if (mp) vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { struct statfs *buf; long bufsize; int flags; } */ *uap; { return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE, uap->flags)); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, enum uio_seg bufseg, int flags) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, sb; size_t count, maxcount; int vfslocked; int error; maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) sfsp = NULL; else if (bufseg == UIO_USERSPACE) sfsp = *buf; else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP, M_WAITOK); } count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * Set these in case the underlying filesystem * fails to do so. */ sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. */ if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || (flags & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } if (bufseg == UIO_SYSSPACE) bcopy(sp, sfsp, sizeof(*sp)); else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } } sfsp++; } VFS_UNLOCK_GIANT(vfslocked); count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(td, uap) struct thread *td; struct freebsd4_statfs_args /* { char *path; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(td, uap) struct thread *td; struct freebsd4_fstatfs_args /* { int fd; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int flags; }; #endif int freebsd4_getfsstat(td, uap) struct thread *td; register struct freebsd4_getfsstat_args /* { struct ostatfs *buf; long bufsize; int flags; } */ *uap; { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags); if (size > 0) { count = td->td_retval[0]; sp = buf; while (count > 0 && error == 0) { cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_TEMP); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(td, uap) struct thread *td; struct freebsd4_fhstatfs_args /* { struct fhandle *u_fhp; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void cvtstatfs(nsp, osp) struct statfs *nsp; struct ostatfs *osp; { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int fchdir(td, uap) struct thread *td; struct fchdir_args /* { int fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, vp, ARG_VNODE1); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { int tvfslocked; if (vfs_busy(mp, 0, 0, td)) continue; tvfslocked = VFS_LOCK_GIANT(mp); error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td); vfs_unbusy(mp, td); if (error) { VFS_UNLOCK_GIANT(tvfslocked); break; } vput(vp); VFS_UNLOCK_GIANT(vfslocked); vp = tdp; vfslocked = tvfslocked; } if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); FILEDESC_XLOCK(fdp); vpold = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vpold->v_mount); vrele(vpold); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int chdir(td, uap) struct thread *td; struct chdir_args /* { char *path; } */ *uap; { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; struct vnode *vp; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); FILEDESC_XLOCK(fdp); vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int chroot(td, uap) struct thread *td; struct chroot_args /* { char *path; } */ *uap; { int error; struct nameidata nd; int vfslocked; error = priv_check(td, PRIV_VFS_CHROOT); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) goto error; vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) goto e_vunlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp))) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0, td); error = change_root(nd.ni_vp, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); e_vunlock: vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error: NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(vp, td) struct vnode *vp; struct thread *td; { int error; ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error) return (error); #endif error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); return (error); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_vnode_check_chroot() to * authorize this operation. */ int change_root(vp, td) struct vnode *vp; struct thread *td; { struct filedesc *fdp; struct vnode *oldvp; int vfslocked; int error; VFS_ASSERT_GIANT(vp->v_mount); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error) { FILEDESC_XUNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; fdp->fd_rdir = vp; VREF(fdp->fd_rdir); if (!fdp->fd_jdir) { fdp->fd_jdir = vp; VREF(fdp->fd_jdir); } FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(oldvp->v_mount); vrele(oldvp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { char *path; int flags; int mode; } */ *uap; { return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode); } int kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, flags); AUDIT_ARG(mode, mode); if ((flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(flags); error = falloc(td, &nfp, &indx); if (error) return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { fdrop(fp, td); td->td_retval[0] = indx; return (0); } /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; fdrop(fp, td); return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ fdclose(fdp, fp, indx, td); fdrop(fp, td); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; fp->f_vnode = vp; /* XXX Does devfs need this? */ /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); fp->f_seqcount = 1; finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops); } VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; atomic_set_int(&fp->f_flag, FHASLOCK); } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_SETATTR(vp, &vat, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } VFS_UNLOCK_GIANT(vfslocked); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: VFS_UNLOCK_GIANT(vfslocked); fdclose(fdp, fp, indx, td); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { char *path; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif int mknod(td, uap) struct thread *td; register struct mknod_args /* { char *path; int mode; int dev; } */ *uap; { return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode, int dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); AUDIT_ARG(dev, dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); break; case S_IFMT: error = priv_check(td, PRIV_VFS_MKNOD_BAD); break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; default: error = EINVAL; break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } else { VATTR_NULL(&vattr); FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { char *path; int mode; } */ *uap; { return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int link(td, uap) struct thread *td; register struct link_args /* { char *path; char *link; } */ *uap; { int error; error = kern_link(td, uap->path, uap->link, UIO_USERSPACE); return (error); } static int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred, td); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error) return (error); } return (0); } int kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct vnode *vp; struct mount *mp; struct nameidata nd; int vfslocked; int lvfslocked; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2, segflg, link, td); if ((error = namei(&nd)) == 0) { lvfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; - } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) + } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = can_hardlink(vp, td, td->td_ucred); if (error == 0) #ifdef MAC error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error == 0) #endif error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0, td); vput(nd.ni_dvp); } NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(lvfslocked); } vrele(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int symlink(td, uap) struct thread *td; register struct symlink_args /* { char *path; char *link; } */ *uap; { return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; int error; struct nameidata nd; int vfslocked; if (segflg == UIO_SYSSPACE) { syspath = path; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0) goto out; } AUDIT_ARG(text, syspath); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, link, td); if ((error = namei(&nd)) != 0) goto out; vfslocked = NDHASGIANT(&nd); if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out2; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { char *path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int unlink(td, uap) struct thread *td; struct unlink_args /* { char *path; } */ *uap; { int error; error = kern_unlink(td, uap->path, UIO_USERSPACE); return (error); } int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error == EINVAL ? EPERM : error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ *uap; { struct ucred *cred = td->td_ucred; struct file *fp; struct vnode *vp; struct vattr vattr; off_t offset; int error, noneg; int vfslocked; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); return (ESPIPE); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); noneg = (vp->v_type != VCHR); offset = uap->offset; switch (uap->whence) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += fp->f_offset; break; case L_XTND: - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); if (error) break; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; drop: fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { int fd; long offset; int whence; } */ *uap; { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ nuap; nuap.fd = uap->fd; nuap.offset = uap->offset; nuap.whence = uap->whence; return (lseek(td, &nuap)); } #endif /* COMPAT_43 */ /* Version with the 'pad' argument */ int freebsd6_lseek(td, uap) struct thread *td; register struct freebsd6_lseek_args *uap; { struct lseek_args ouap; ouap.fd = uap->fd; ouap.offset = uap->offset; ouap.whence = uap->whence; return (lseek(td, &ouap)); } /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, flags); if (error) return (error); #endif if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { char *path; int flags; } */ *uap; { return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct ucred *cred, *tmpcred; register struct vnode *vp; struct nameidata nd; int vfslocked; int error; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. */ cred = td->td_ucred; tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) goto out1; vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_access(vp, flags, tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); out1: td->td_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { char *path; int flags; } */ *uap; { return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct nameidata nd; struct vnode *vp; int vfslocked; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; vfslocked = NDHASGIANT(&nd); error = vn_access(vp, flags, td->td_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(td, uap) struct thread *td; register struct ostat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(td, uap) struct thread *td; register struct olstat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. */ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif int stat(td, uap) struct thread *td; register struct stat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { struct nameidata nd; struct stat sb; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("stat(%d): %s\n", vfslocked, path); if (error) return (error); *sbp = sb; return (0); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int lstat(td, uap) struct thread *td; register struct lstat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { struct vnode *vp; struct stat sb; struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); *sbp = sb; return (0); } /* * Implementation of the NetBSD [l]stat() functions. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { bzero(nsb, sizeof *nsb); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtimespec = sb->st_birthtimespec; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif int nstat(td, uap) struct thread *td; register struct nstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { char *path; int name; } */ *uap; { return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name)); } int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name) { struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* If asynchronous I/O is available, it works for all files. */ if (name == _PC_ASYNC_IO) td->td_retval[0] = async_io_version; else error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif int readlink(td, uap) struct thread *td; register struct readlink_args /* { char *path; char *buf; int count; } */ *uap; { return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, int count) { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #endif if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); } vput(vp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_flags = flags; #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif int chflags(td, uap) struct thread *td; register struct chflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Same as chflags() but doesn't follow symlinks. */ int lchflags(td, uap) struct thread *td; register struct lchflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { int fd; int flags; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(fflags, uap->flags); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT - vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfflags(td, fp->f_vnode, uap->flags); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(td->td_ucred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int chmod(td, uap) struct thread *td; register struct chmod_args /* { char *path; int mode; } */ *uap; { return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { char *path; int mode; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, (mode_t)uap->mode); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, uap->mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { int fd; int mode; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(mode, uap->mode); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT - vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfmode(td, fp->f_vnode, uap->mode); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(td->td_ucred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int chown(td, uap) struct thread *td; register struct chown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(owner, uid, gid); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int lchown(td, uap) struct thread *td; register struct lchown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(owner, uid, gid); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int fchown(td, uap) struct thread *td; register struct fchown_args /* { int fd; int uid; int gid; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(owner, uap->uid, uap->gid); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT - vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfown(td, fp->f_vnode, uap->uid, uap->gid); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tvpseg, tsp) const struct timeval *usrtvp; enum uio_seg tvpseg; struct timespec *tsp; { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int setutimes(td, vp, ts, numtimes, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int numtimes; int nullflag; { int error, setbirthtime; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); setbirthtime = 0; if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int utimes(td, uap) struct thread *td; register struct utimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int futimes(td, uap) struct thread *td; register struct futimes_args /* { int fd; struct timeval *tptr; } */ *uap; { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, fd); if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT - vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int truncate(td, uap) struct thread *td; register struct truncate_args /* { char *path; int pad; off_t length; } */ *uap; { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); } vput(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { char *path; long length; } */ *uap; { struct truncate_args /* { char *path; int pad; off_t length; } */ nuap; nuap.path = uap->path; nuap.length = uap->length; return (truncate(td, &nuap)); } #endif /* COMPAT_43 */ /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { struct truncate_args ouap; ouap.path = uap->path; ouap.length = uap->length; return (truncate(td, &ouap)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { struct ftruncate_args ouap; ouap.fd = uap->fd; ouap.length = uap->length; return (ftruncate(td, &ouap)); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int fsync(td, uap) struct thread *td; struct fsync_args /* { int fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, vp, ARG_VNODE1); if (vp->v_object != NULL) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int rename(td, uap) struct thread *td; register struct rename_args /* { char *from; char *to; } */ *uap; { return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE)); } int kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int tvfslocked; int fvfslocked; int error; bwillwrite(); #ifdef MAC NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, from, td); #else NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, from, td); #endif if ((error = namei(&fromnd)) != 0) return (error); fvfslocked = NDHASGIANT(&fromnd); tvfslocked = 0; #ifdef MAC error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0, td); if (fromnd.ni_dvp != fromnd.ni_vp) VOP_UNLOCK(fromnd.ni_vp, 0, td); #endif fvp = fromnd.ni_vp; if (error == 0) error = vn_start_write(fvp, &mp, V_WAIT | PCATCH); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | MPSAFE | AUDITVNODE2, pathseg, to, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); vn_finished_write(mp); goto out1; } tvfslocked = NDHASGIANT(&tond); tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (!error) { VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); VFS_UNLOCK_GIANT(fvfslocked); VFS_UNLOCK_GIANT(tvfslocked); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { char *path; int mode; } */ *uap; { return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int rmdir(td, uap) struct thread *td; struct rmdir_args /* { char *path; } */ *uap; { return (kern_rmdir(td, uap->path, UIO_USERSPACE)); } int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt, vfslocked; long loff; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } #endif # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = uap->count; MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = uap->count - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); error = copyout(&loff, uap->basep, sizeof(long)); fdrop(fp, td); td->td_retval[0] = uap->count - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; int vfslocked; long loff; int error, eofflag; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); error = EINVAL; goto fail; } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; - /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* vn_lock(vp, LK_SHARED | LK_RETRY); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, vp, ARG_VNODE1); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); goto fail; } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (uap->basep != NULL) { error = copyout(&loff, uap->basep, sizeof(long)); } td->td_retval[0] = uap->count - auio.uio_resid; fail: fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { int fd; char *buf; u_int count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (getdirentries(td, &ap)); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { int newmask; } */ *uap; { register struct filedesc *fdp; FILEDESC_XLOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_XUNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int revoke(td, uap) struct thread *td; register struct revoke_args /* { char *path; } */ *uap; { struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); if (error) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error) goto out; } if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Convert a user file descriptor to a kernel file entry. * A reference on the file entry is held upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { int error; struct file *fp; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_SLOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; error = EINVAL; } else { fhold(fp); error = 0; } FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int lgetfh(td, uap) struct thread *td; register struct lgetfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { const struct fhandle *u_fhp; int flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int vfslocked; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error) return (error); fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) goto out; /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. */ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, mode); if (error) goto bad; #endif if (mode) { error = VOP_ACCESS(vp, mode, td->td_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); goto out; } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */ #ifdef MAC /* * We don't yet have fp->f_cred, so use td->td_ucred, which * should be right. */ error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp); if (error == 0) { #endif VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, td->td_ucred, td); #ifdef MAC } #endif vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL); if (error) goto bad; if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; nfp->f_vnode = vp; finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops); if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. */ fdclose(fdp, fp, indx, td); /* * release our private reference */ fdrop(fp, td); goto out; } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); atomic_set_int(&fp->f_flag, FHASLOCK); } VOP_UNLOCK(vp, 0, td); fdrop(fp, td); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = indx; return (0); bad: vput(vp); out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { struct fhandle *u_fhp; struct stat *sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error) return (error); error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) { vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { struct fhandle *u_fhp; struct statfs *buf; } */ *uap; { struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); return (copyout(&sf, uap->buf, sizeof(sf))); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct statfs *sp; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); error = VFS_FHTOVP(mp, &fh.fh_fid, &vp); if (error) { VFS_UNLOCK_GIANT(vfslocked); vfs_rel(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error == 0) *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 175201) +++ head/sys/kern/vfs_vnops.c (revision 175202) @@ -1,1312 +1,1313 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; struct fileops vnops = { .fo_read = vn_read, .fo_write = vn_write, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; int vn_open(ndp, flagp, cmode, fp) struct nameidata *ndp; int *flagp, cmode; struct file *fp; { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp)); } /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(ndp, flagp, cmode, cred, fp) struct nameidata *ndp; int *flagp, cmode; struct ucred *cred; struct file *fp; { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int mode, fmode, error; int vfslocked, mpsafe; mpsafe = ndp->ni_cnd.cn_flags & MPSAFE; restart: vfslocked = 0; fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); vfslocked = NDHASGIANT(ndp); if (!mpsafe) ndp->ni_cnd.cn_flags &= ~MPSAFE; if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) { #endif VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); #ifdef MAC } #endif vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { VFS_UNLOCK_GIANT(vfslocked); NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ISOPEN | ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF | MPSAFE | AUDITVNODE1; if ((error = namei(ndp)) != 0) return (error); if (!mpsafe) ndp->ni_cnd.cn_flags &= ~MPSAFE; vfslocked = NDHASGIANT(ndp); vp = ndp->ni_vp; } if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_vnode_check_open(cred, vp, mode); if (error) goto bad; #endif if ((fmode & O_CREAT) == 0) { if (mode & VWRITE) { error = vn_writechk(vp); if (error) goto bad; } if (mode) { error = VOP_ACCESS(vp, mode, cred, td); if (error) goto bad; } } if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0) goto bad; if (fmode & FWRITE) vp->v_writecount++; *flagp = fmode; ASSERT_VOP_ELOCKED(vp, "vn_open_cred"); if (!mpsafe) VFS_UNLOCK_GIANT(vfslocked); return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); *flagp = fmode; ndp->ni_vp = NULL; return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (vp->v_vflag & VV_TEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, file_cred, td) register struct vnode *vp; int flags; struct ucred *file_cred; struct thread *td; { struct mount *mp; int error; VFS_ASSERT_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (flags & FWRITE) { VNASSERT(vp->v_writecount > 0, vp, ("vn_close: negative writecount")); vp->v_writecount--; } error = VOP_CLOSE(vp, flags, file_cred, td); vput(vp); vn_finished_write(mp); return (error); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ fp->f_seqcount += howmany(uio->uio_resid, 16384); if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return (fp->f_seqcount << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; int *aresid; struct thread *td; { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; int error; VFS_ASSERT_GIANT(vp->v_mount); if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { /* * XXX This should be LK_SHARED but I don't trust VFS * enough to leave it like that until it has been * reviewed further. */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred) cred = file_cred; else cred = active_cred; if (rw == UIO_READ) error = VOP_READ(vp, &auio, ioflg, cred); else error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { if (rw == UIO_WRITE && vp->v_type != VCHR) vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; size_t len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; size_t *aresid; struct thread *td; { int error = 0; int iaresid; VFS_ASSERT_GIANT(vp->v_mount); do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; uio_yield(); } while (len); if (aresid) *aresid = len + iaresid; return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct vnode *vp; int error, ioflag; struct mtx *mtxp; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); mtxp = NULL; vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; vfslocked = VFS_LOCK_GIANT(vp->v_mount); VOP_LEASE(vp, td, fp->f_cred, LEASE_READ); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ if ((flags & FOF_OFFSET) == 0) { mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); while(fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vnread offlock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY, td); + vn_lock(vp, LK_SHARED | LK_RETRY); ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; mtx_lock(mtxp); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; mtx_unlock(mtxp); } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode write routine. */ static int vn_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct vnode *vp; struct mount *mp; int error, ioflag; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VCHR) vn_finished_write(mp); unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table truncate routine. */ static int vn_truncate(fp, length, active_cred, td) struct file *fp; off_t length; struct ucred *active_cred; struct thread *td; { struct vattr vattr; struct mount *mp; struct vnode *vp; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) { VFS_UNLOCK_GIANT(vfslocked); return (error); } VOP_LEASE(vp, td, active_cred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_writechk(vp); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } out: VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode stat routine. */ static int vn_statfile(fp, sb, active_cred, td) struct file *fp; struct stat *sb; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = vn_stat(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat a vnode; implementation for the stat syscall */ int vn_stat(vp, sb, active_cred, file_cred, td) struct vnode *vp; register struct stat *sb; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; #ifdef MAC error = mac_vnode_check_stat(active_cred, file_cred, vp); if (error) return (error); #endif vap = &vattr; error = VOP_GETATTR(vp, vap, active_cred, td); if (error) return (error); /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; /* This is a cosmetic change, symlinks do not have a mode. */ if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) sb->st_mode &= ~ACCESSPERMS; /* 0000 */ else sb->st_mode |= ACCESSPERMS; /* 0777 */ break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec = vap->va_mtime; sb->st_ctimespec = vap->va_ctime; sb->st_birthtimespec = vap->va_birthtime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Default to PAGE_SIZE after much discussion. * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct. */ sb->st_blksize = PAGE_SIZE; sb->st_flags = vap->va_flags; if (priv_check(td, PRIV_VFS_GENERATION)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; sb->st_blocks = vap->va_bytes / S_BLKSIZE; return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, active_cred, td) struct file *fp; u_long com; void *data; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; struct vattr vattr; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = ENOTTY; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(vp, &vattr, active_cred, td); VOP_UNLOCK(vp, 0, td); if (!error) *(int *)data = vattr.va_size - fp->f_offset; } if (com == FIONBIO || com == FIOASYNC) /* XXX */ error = 0; else error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td); break; default: break; } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode poll routine. */ static int vn_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct vnode *vp; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); #ifdef MAC - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp, 0, td); if (!error) #endif error = VOP_POLL(vp, events, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Check that the vnode is still valid, and if so * acquire requested lock. */ int -_vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line) +_vn_lock(struct vnode *vp, int flags, char *file, int line) { int error; do { if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) && vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); return (ENOENT); } /* * Just polling to check validity. */ if ((flags & LK_TYPE_MASK) == 0) { VI_UNLOCK(vp); return (0); } /* * lockmgr drops interlock before it will return for * any reason. So force the code above to relock it. */ - error = VOP_LOCK1(vp, flags | LK_INTERLOCK, td, file, line); + error = VOP_LOCK1(vp, flags | LK_INTERLOCK, curthread, file, + line); flags &= ~LK_INTERLOCK; KASSERT((flags & LK_RETRY) == 0 || error == 0, ("LK_RETRY set with incompatible flags %d\n", flags)); /* * Callers specify LK_RETRY if they wish to get dead vnodes. * If RETRY is not set, we return ENOENT instead. */ if (error == 0 && vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) { - VOP_UNLOCK(vp, 0, td); + VOP_UNLOCK(vp, 0, curthread); error = ENOENT; break; } } while (flags & LK_RETRY && error != 0); return (error); } /* * File table vnode close routine. */ static int vn_closefile(fp, td) struct file *fp; struct thread *td; { struct vnode *vp; struct flock lf; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); } fp->f_ops = &badfileops; error = vn_close(vp, fp->f_flag, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ int vn_start_write(vp, mpp, flags) struct vnode *vp; struct mount **mpp; int flags; { struct mount *mp; int error; error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); /* * Check on status of suspension. */ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH), "suspfs", 0); if (error) goto unlock; } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_write_suspend_wait(vp, mp, flags) struct vnode *vp; struct mount *mp; int flags; { int error; if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) { if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if (mp == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) { MNT_REL(mp); MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0); vfs_rel(mp); return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(vp, mpp, flags) struct vnode *vp; struct mount **mpp; int flags; { struct mount *mp; int error; retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_REL(mp); MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. */ void vn_finished_write(mp) struct mount *mp; { if (mp == NULL) return; MNT_ILOCK(mp); mp->mnt_writeopcount--; if (mp->mnt_writeopcount < 0) panic("vn_finished_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_writeopcount <= 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(mp) struct mount *mp; { if (mp == NULL) return; MNT_ILOCK(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(mp) struct mount *mp; { struct thread *td = curthread; int error; MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPEND) { MNT_IUNLOCK(mp); return (0); } mp->mnt_kern_flag |= MNTK_SUSPEND; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0) vfs_write_resume(mp); return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(mp) struct mount *mp; { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); } MNT_IUNLOCK(mp); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = VOP_KQFILTER(fp->f_vnode, kn); VFS_UNLOCK_GIANT(vfslocked); return error; } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0, td); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } Index: head/sys/nfs4client/nfs4_vnops.c =================================================================== --- head/sys/nfs4client/nfs4_vnops.c (revision 175201) +++ head/sys/nfs4client/nfs4_vnops.c (revision 175202) @@ -1,2878 +1,2878 @@ /* $Id: nfs_vnops.c,v 1.45 2003/11/05 14:59:02 rees Exp $ */ /*- * copyright (c) 2003 * the regents of the university of michigan * all rights reserved * * permission is granted to use, copy, create derivative works and redistribute * this software and such derivative works for any purpose, so long as the name * of the university of michigan is not used in any advertising or publicity * pertaining to the use or distribution of this software without specific, * written prior authorization. if the above copyright notice or any other * identification of the university of michigan is included in any copy of any * portion of this software, then the disclaimer below must also be included. * * this software is provided as is, without representation from the university * of michigan as to its fitness for any purpose, and without warranty by the * university of michigan of any kind, either express or implied, including * without limitation the implied warranties of merchantability and fitness for * a particular purpose. the regents of the university of michigan shall not be * liable for any damages, including special, indirect, incidental, or * consequential damages, with respect to any claim arising out of or in * connection with the use of the software, even if it has been or is hereafter * advised of the possibility of such damages. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); /* * vnode op calls for Sun NFS version 2 and 3 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* NFSv4 */ #include #include /* Defs */ #define TRUE 1 #define FALSE 0 /* * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these * calls are not in getblk() and brelse() so that they would not be necessary * here. */ #ifndef B_VMIO #define vfs_busy_pages(bp, f) #endif static int nfs4_flush(struct vnode *, int, struct thread *, int); static int nfs4_setattrrpc(struct vnode *, struct vattr *, struct ucred *, struct thread *); static int nfs4_closerpc(struct vnode *, struct ucred *, struct thread *, int); static vop_lookup_t nfs4_lookup; static vop_create_t nfs4_create; static vop_mknod_t nfs4_mknod; static vop_open_t nfs4_open; static vop_close_t nfs4_close; static vop_access_t nfs4_access; static vop_getattr_t nfs4_getattr; static vop_setattr_t nfs4_setattr; static vop_read_t nfs4_read; static vop_fsync_t nfs4_fsync; static vop_remove_t nfs4_remove; static vop_link_t nfs4_link; static vop_rename_t nfs4_rename; static vop_mkdir_t nfs4_mkdir; static vop_rmdir_t nfs4_rmdir; static vop_symlink_t nfs4_symlink; static vop_readdir_t nfs4_readdir; static vop_strategy_t nfs4_strategy; static int nfs4_lookitup(struct vnode *, const char *, int, struct ucred *, struct thread *, struct nfsnode **); static int nfs4_sillyrename(struct vnode *, struct vnode *, struct componentname *); static vop_readlink_t nfs4_readlink; static vop_print_t nfs4_print; static vop_advlock_t nfs4_advlock; /* * Global vfs data structures for nfs */ struct vop_vector nfs4_vnodeops = { .vop_default = &default_vnodeops, .vop_access = nfs4_access, .vop_advlock = nfs4_advlock, .vop_close = nfs4_close, .vop_create = nfs4_create, .vop_fsync = nfs4_fsync, .vop_getattr = nfs4_getattr, .vop_getpages = nfs_getpages, .vop_putpages = nfs_putpages, .vop_inactive = nfs_inactive, .vop_lease = VOP_NULL, .vop_link = nfs4_link, .vop_lookup = nfs4_lookup, .vop_mkdir = nfs4_mkdir, .vop_mknod = nfs4_mknod, .vop_open = nfs4_open, .vop_print = nfs4_print, .vop_read = nfs4_read, .vop_readdir = nfs4_readdir, .vop_readlink = nfs4_readlink, .vop_reclaim = nfs_reclaim, .vop_remove = nfs4_remove, .vop_rename = nfs4_rename, .vop_rmdir = nfs4_rmdir, .vop_setattr = nfs4_setattr, .vop_strategy = nfs4_strategy, .vop_symlink = nfs4_symlink, .vop_write = nfs_write, }; static int nfs4_removerpc(struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct thread *td); static int nfs4_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td); static int nfs4_renameit(struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp); static int nfs4_openrpc(struct vnode *, struct vnode **, struct componentname *, int, struct vattr *); static int nfs4_open_confirm(struct vnode *vp, struct nfs4_compound *cpp, struct nfs4_oparg_open *openap, struct nfs4_oparg_getfh *gfh, struct ucred *cred, struct thread *td); static int nfs4_createrpc(struct vnode *, struct vnode **, struct componentname *, nfstype, struct vattr *, char *); /* * Global variables */ struct nfs4_lowner nfs4_masterlowner; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) SYSCTL_DECL(_vfs_nfs4); static int nfs4_access_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs4, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfs4_access_cache_timeout, 0, "NFS ACCESS cache timeout"); #if 0 static int nfsv3_commit_on_close = 0; SYSCTL_INT(_vfs_nfs4, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW, &nfsv3_commit_on_close, 0, "write+commit on close, else only write"); SYSCTL_INT(_vfs_nfs4, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count"); SYSCTL_INT(_vfs_nfs4, OID_AUTO, access_cache_misses, CTLFLAG_RD, &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count"); #endif #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY \ | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP) static int nfs4_v3_access_otw(struct vnode *vp, int wmode, struct thread *td, struct ucred *cred) { const int v3 = 1; u_int32_t *tl; int error = 0, attrflag; return (0); struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); nfsstats.rpccnt[NFSPROC_ACCESS]++; mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(wmode); nfsm_request(vp, NFSPROC_ACCESS, td, cred); nfsm_postop_attr(vp, attrflag); if (!error) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); rmode = fxdr_unsigned(u_int32_t, *tl); np->n_mode = rmode; np->n_modeuid = cred->cr_uid; np->n_modestamp = time_second; } m_freem(mrep); nfsmout: return error; } /* * nfs access vnode op. * For nfs version 2, just return ok. File accesses may fail later. * For nfs version 3, use the access rpc to check accessibility. If file modes * are changed on the server, accesses might still fail later. */ static int nfs4_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int error = 0; u_int32_t mode, wmode; int v3 = NFS_ISV3(vp); /* v3 \in v4 */ struct nfsnode *np = VTONFS(vp); caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; struct nfs4_oparg_access acc; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * For nfs v3, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. * If not, do an access rpc, otherwise you are stuck emulating * ufs_access() locally using the vattr. This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ /* XXX Disable this for now; needs fixing of _access_otw() */ if (0 && v3) { if (ap->a_mode & VREAD) mode = NFSV3ACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_EXECUTE; } else { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_LOOKUP; } /* XXX safety belt, only make blanket request if caching */ if (nfs4_access_cache_timeout > 0) { wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? */ if (time_second < np->n_modestamp + nfs4_access_cache_timeout && ap->a_cred->cr_uid == np->n_modeuid && (np->n_mode & mode) == mode) { nfsstats.accesscache_hits++; } else { /* * Either a no, or a don't know. Go to the wire. */ nfsstats.accesscache_misses++; error = nfs4_v3_access_otw(vp, wmode, ap->a_td, ap->a_cred); if (error == 0) { if ((np->n_mode & mode) != mode) error = EACCES; } } return (error); } /* XXX use generic access code here? */ mode = ap->a_mode & VREAD ? NFSV4ACCESS_READ : 0; if (vp->v_type == VDIR) { if (ap->a_mode & VWRITE) mode |= NFSV4ACCESS_MODIFY | NFSV4ACCESS_EXTEND | NFSV4ACCESS_DELETE; if (ap->a_mode & VEXEC) mode |= NFSV4ACCESS_LOOKUP; } else { if (ap->a_mode & VWRITE) mode |= NFSV4ACCESS_MODIFY | NFSV4ACCESS_EXTEND; if (ap->a_mode & VEXEC) mode |= NFSV4ACCESS_EXECUTE; } nfs_v4initcompound(&cp); acc.mode = mode; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_access()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_access(&cp, &acc); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_access(&cp, &acc); if ((acc.rmode & mode) != mode) error = EACCES; nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } static int nfs4_openrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int flags, struct vattr *vap) { struct vnode *vp = *vpp; struct nfs4_oparg_getattr getattr; struct nfs4_oparg_getfh getfh; struct nfs4_oparg_open opena; struct nfs4_compound cp; caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; struct nfs4_fctx xfc, *fcp; struct nfsnode *np; if (vp == NULL) { /* Create a new file */ np = NULL; fcp = &xfc; bzero(fcp, sizeof(*fcp)); } else { np = VTONFS(vp); fcp = flags & FWRITE ? &np->n_wfc : &np->n_rfc; } /* * Since we are currently only one lockowner; we only open the * file once each for reading and writing. */ if (fcp->refcnt++ != 0) { *vpp = vp; /*printf("not opening %s\n", np->n_name != NULL ? np->n_name : "");*/ return (0); } fcp->lop = &nfs4_masterlowner; fcp->np = np; nfs_v4initcompound(&cp); cp.nmp = VFSTONFS(dvp->v_mount); opena.ctype = NCLNULL; opena.flags = flags; opena.vap = vap; opena.fcp = fcp; /* For lockowner */ opena.cnp = cnp; getattr.bm = &nfsv4_getattrbm; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_openrpc()"); nfsm_v4build_putfh(&cp, dvp); nfsm_v4build_open(&cp, &opena); nfsm_v4build_getattr(&cp, &getattr); nfsm_v4build_getfh(&cp, &getfh); nfsm_v4build_finalize(&cp); nfsm_request(vp != NULL ? vp : dvp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_open(&cp, &opena); nfsm_v4dissect_getattr(&cp, &getattr); nfsm_v4dissect_getfh(&cp, &getfh); error = nfs_v4postop(&cp, error); if (opena.rflags & NFSV4OPENRES_CONFIRM) { error = nfs4_open_confirm(vp ? vp : dvp, &cp, &opena, &getfh, cred, td); if (error != 0) goto nfsmout; } if (vp == NULL) { /* New file */ error = nfs_nget(dvp->v_mount, &getfh.fh_val, getfh.fh_len, &np, LK_EXCLUSIVE); if (error != 0) goto nfsmout; vp = NFSTOV(np); np->n_dvp = dvp; np->n_namelen = cnp->cn_namelen; /* XXX memory leaks on these; track! */ if (np->n_name != NULL) FREE(np->n_name, M_NFSREQ); MALLOC(np->n_name, u_char *, np->n_namelen + 1, M_NFSREQ, M_WAITOK); bcopy(cnp->cn_nameptr, np->n_name, np->n_namelen); np->n_name[np->n_namelen] = '\0'; if (flags & FWRITE) np->n_wfc = *fcp; else np->n_rfc = *fcp; /*printf("opened new file %s\n", np->n_name);*/ nfs4_vnop_loadattrcache(vp, &getattr.fa, NULL); *vpp = vp; } else { /*printf("openend \"old\" %s\n", np->n_name != NULL ? np->n_name : "");*/ if (flags & O_TRUNC && np->n_size != 0) { struct vattr va; VATTR_NULL(&va); va.va_size = 0; error = nfs4_setattrrpc(vp, &va, cnp->cn_cred, cnp->cn_thread); } np->n_attrstamp = 0; } nfsmout: if (mrep != NULL) m_freem(mrep); return (error); } static int nfs4_open_confirm(struct vnode *vp, struct nfs4_compound *cpp, struct nfs4_oparg_open *openap, struct nfs4_oparg_getfh *gfh, struct ucred *cred, struct thread *td) { caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; nfs_v4initcompound(cpp); cpp->nmp = VFSTONFS(vp->v_mount); mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(cpp, "nfs4_open_confirm()"); nfsm_v4build_putfh_nv(cpp, gfh); nfsm_v4build_open_confirm(cpp, openap); nfsm_v4build_finalize(cpp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(cpp); nfsm_v4dissect_putfh(cpp); nfsm_v4dissect_open_confirm(cpp, openap); nfsmout: error = nfs_v4postop(cpp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs4_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); enum vtype vtype = vp->v_type; int mode = ap->a_mode; struct componentname cn; if (vtype != VREG) { if (vtype != VDIR && vtype != VLNK) { #ifdef DIAGNOSTIC printf("open eacces vtyp=%d\n", vp->v_type); #endif return (EACCES); } else return (0); } if (np->n_flag & NCREATED) { np->n_flag &= ~NCREATED; return (0); } cn.cn_nameptr = np->n_name; cn.cn_namelen = np->n_namelen; cn.cn_cred = ap->a_cred; cn.cn_thread = ap->a_td; return (nfs4_openrpc(np->n_dvp, &vp, &cn, mode, NULL)); } static int nfs4_closerpc(struct vnode *vp, struct ucred *cred, struct thread *td, int flags) { caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_fctx *fcp; struct nfs4_compound cp; struct nfsnode *np = VTONFS(vp); fcp = flags & FWRITE ? &np->n_wfc : &np->n_rfc; nfs_v4initcompound(&cp); if (--fcp->refcnt != 0) return (0); /*printf("closing %s\n", np->n_name != NULL ? np->n_name : "");*/ cp.fcp = fcp; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_closerpc()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_close(&cp, fcp); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_close(&cp, fcp); nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs close vnode op * play it safe for now (see comments in v2/v3 nfs_close regarding dirty buffers) */ /* ARGSUSED */ static int nfs4_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); int error = 0; if (vp->v_type != VREG) return (0); if (np->n_flag & NMODIFIED) { error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); np->n_attrstamp = 0; } error = nfs4_closerpc(vp, ap->a_cred, ap->a_td, ap->a_fflag); if (!error && np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; error = np->n_error; } return (error); } /* * nfs getattr call from vfs. */ static int nfs4_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_oparg_getattr ga; struct nfs4_compound cp; /* * Update local times for special files. */ if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; /* * First look in the cache. */ if (nfs_getattrcache(vp, ap->a_vap) == 0) return (0); nfsstats.rpccnt[NFSPROC_GETATTR]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_getattrbm; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_getattr()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, ap->a_td, ap->a_cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_getattr(&cp, &ga); nfs4_vnop_loadattrcache(vp, &ga.fa, ap->a_vap); nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs setattr call. */ static int nfs4_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr *vap = ap->a_vap; int error = 0; u_quad_t tsize; #ifndef nolint tsize = (u_quad_t)0; #endif /* * Setting of flags is not supported. */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_mtime.tv_sec == VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL) return (0); vap->va_size = VNOVAL; break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * We run vnode_pager_setsize() early (why?), * we must set np->n_size now to avoid vinvalbuf * V_SAVE races that might setsize a lower * value. */ tsize = np->n_size; error = nfs_meta_setsize(vp, ap->a_cred, ap->a_td, vap->va_size); if (np->n_flag & NMODIFIED) { if (vap->va_size == 0) error = nfs_vinvalbuf(vp, 0, ap->a_td, 1); else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error) { vnode_pager_setsize(vp, np->n_size); return (error); } } /* * np->n_size has already been set to vap->va_size * in nfs_meta_setsize(). We must set it again since * nfs_loadattrcache() could be called through * nfs_meta_setsize() and could modify np->n_size. */ np->n_vattr.va_size = np->n_size = vap->va_size; }; } else if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG && (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) == EINTR) return (error); if (vap->va_size != VNOVAL && np->n_wfc.refcnt == 0) { /* Have to open the file before we can truncate it */ struct componentname cn; cn.cn_nameptr = np->n_name; cn.cn_namelen = np->n_namelen; cn.cn_cred = ap->a_cred; cn.cn_thread = ap->a_td; error = nfs4_openrpc(np->n_dvp, &vp, &cn, FWRITE, NULL); if (error) return error; np->n_flag |= NTRUNCATE; } error = nfs4_setattrrpc(vp, vap, ap->a_cred, ap->a_td); if (error && vap->va_size != VNOVAL) { np->n_size = np->n_vattr.va_size = tsize; vnode_pager_setsize(vp, np->n_size); } return (error); } /* * Do an nfs setattr rpc. */ static int nfs4_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; struct nfs4_oparg_getattr ga; struct nfsnode *np = VTONFS(vp); struct nfs4_fctx *fcp; nfsstats.rpccnt[NFSPROC_SETATTR]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_getattrbm; fcp = (vap->va_size != VNOVAL) ? &np->n_wfc : NULL; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_setattrrpc"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_setattr(&cp, vap, fcp); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_setattr(&cp); nfsm_v4dissect_getattr(&cp, &ga); nfs4_vnop_loadattrcache(vp, &ga.fa, NULL); /* TODO: do the settatr and close in a single compound rpc */ if (np->n_flag & NTRUNCATE) { error = nfs4_closerpc(vp, cred, td, FWRITE); np->n_flag &= ~NTRUNCATE; } nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs lookup call, one step at a time... * First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs4_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int isdot, flags = cnp->cn_flags; struct vnode *newvp; struct nfsmount *nmp; caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; long len; nfsfh_t *fhp; struct nfsnode *np; int error = 0, fhsize; struct thread *td = cnp->cn_thread; struct nfs4_compound cp; struct nfs4_oparg_getattr ga, dga; struct nfs4_oparg_lookup l; struct nfs4_oparg_getfh gfh; *vpp = NULLVP; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); isdot = cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) { *vpp = NULLVP; return (error); } if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { struct vattr vattr; newvp = *vpp; if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, td) && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { nfsstats.lookupcache_hits++; if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); if (newvp != dvp) vput(newvp); else vrele(newvp); } error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; nfsstats.rpccnt[NFSPROC_LOOKUP]++; len = cnp->cn_namelen; mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_getattrbm; dga.bm = &nfsv4_getattrbm; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_lookup()"); nfsm_v4build_putfh(&cp, dvp); nfsm_v4build_getattr(&cp, &dga); if (flags & ISDOTDOT) nfsm_v4build_lookupp(&cp); else if (!isdot) { l.name = cnp->cn_nameptr; l.namelen = len; nfsm_v4build_lookup(&cp, &l); } nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_getfh(&cp, &gfh); nfsm_v4build_finalize(&cp); nfsm_request(dvp, NFSV4PROC_COMPOUND, cnp->cn_thread, cnp->cn_cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_getattr(&cp, &dga); if (flags & ISDOTDOT) nfsm_v4dissect_lookupp(&cp); else if (!isdot) nfsm_v4dissect_lookup(&cp); nfsm_v4dissect_getattr(&cp, &ga); nfsm_v4dissect_getfh(&cp, &gfh); nfs4_vnop_loadattrcache(dvp, &dga.fa, NULL); fhp = &gfh.fh_val; fhsize = gfh.fh_len; /* * Handle RENAME case... */ if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) return (EISDIR); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL); *vpp = newvp; cnp->cn_flags |= SAVENAME; return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); newvp = NFSTOV(np); nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL); } else if (NFS_CMPFH(np, fhp, fhsize)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); /* Fill in np used by open. */ np->n_dvp = dvp; np->n_namelen = cnp->cn_namelen; if (np->n_name != NULL) FREE(np->n_name, M_NFSREQ); MALLOC(np->n_name, u_char *, np->n_namelen + 1, M_NFSREQ, M_WAITOK); bcopy(cnp->cn_nameptr, np->n_name, np->n_namelen); np->n_name[np->n_namelen] = '\0'; nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL); } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(dvp, newvp, cnp); } *vpp = newvp; m_freem(mrep); nfsmout: error = nfs_v4postop(&cp, error); if (error) { if (newvp != NULLVP) { vrele(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { if (dvp->v_mount->mnt_flag & MNT_RDONLY) error = EROFS; else error = EJUSTRETURN; } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } return (error); } /* * nfs read call. * Just call nfs_bioread() to do the work. */ static int nfs4_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VREG: return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); case VDIR: return (EISDIR); default: return (EOPNOTSUPP); } } /* * nfs readlink call */ static int nfs4_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by nfs_doio() from below the buffer cache. */ int nfs4_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; nfsstats.rpccnt[NFSPROC_READLINK]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_readlinkrpc()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_readlink(&cp); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_readlink(&cp, uiop); nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs read rpc call * Ditto above */ int nfs4_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfsmount *nmp; int error = 0, len, tsiz; struct nfs4_compound cp; struct nfs4_oparg_read read; struct nfsnode *np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); tsiz = uiop->uio_resid; if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) return (EFBIG); if (tsiz == 0) return (0); read.uiop = uiop; read.fcp = np->n_rfc.refcnt > 0 ? &np->n_rfc : &np->n_wfc; while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_READ]++; len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz; read.off = uiop->uio_offset; read.maxcnt = len; nfs_v4initcompound(&cp); mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_readrpc()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_read(&cp, &read); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred); if (error != 0) { error = nfs_v4postop(&cp, error); goto nfsmout; } nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_read(&cp, &read); if (read.eof || read.retlen == 0) tsiz = 0; else tsiz -= read.retlen; error = nfs_v4postop(&cp, error); m_freem(mrep); mrep = NULL; } nfsmout: if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs write call */ int nfs4_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, int *iomode, int *must_commit) { int32_t backup; caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, len, tsiz, wccflag = 1, rlen; struct nfs4_compound cp; struct nfs4_oparg_write write; nfsv4stablehow commit, committed = NSHFILESYNC; caddr_t verf; struct nfsnode *np = VTONFS(vp); #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs: writerpc iovcnt > 1"); #endif *must_commit = 0; tsiz = uiop->uio_resid; if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) return (EFBIG); if (tsiz == 0) return (0); write.stable = (nfsv4stablehow)*iomode; write.uiop = uiop; write.fcp = &np->n_wfc; while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz; write.off = uiop->uio_offset; write.cnt = len; nfs_v4initcompound(&cp); mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_writerpc()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_write(&cp, &write); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred); if (error != 0) { error = nfs_v4postop(&cp, error); goto nfsmout; } nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_write(&cp, &write); rlen = write.retlen; if (rlen == 0) { error = NFSERR_IO; break; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = write.committed; if (committed == NSHFILESYNC || (committed = NSHDATASYNC && commit == NSHUNSTABLE)) committed = commit; verf = (caddr_t)write.wverf; if ((nmp->nm_flag & NFSSTA_HASWRITEVERF) == 0) { bcopy(verf, nmp->nm_verf, NFSX_V4VERF); nmp->nm_flag |= NFSMNT_HASWRITEVERF; } else if (bcmp(verf, nmp->nm_verf, NFSX_V4VERF)) { *must_commit = 1; bcopy(verf, nmp->nm_verf, NFSX_V4VERF); } /* XXX wccflag */ if (wccflag) VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime; error = nfs_v4postop(&cp, error); m_freem(mrep); mrep = NULL; if (error) break; tsiz -= len; } nfsmout: if (mrep != NULL) m_freem(mrep); *iomode = committed; if (error) uiop->uio_resid = tsiz; return (error); } /* ARGSUSED */ static int nfs4_mknod(struct vop_mknod_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *newvp = NULL; int error; error = nfs4_createrpc(ap->a_dvp, &newvp, ap->a_cnp, (nfstype)vap->va_type, vap, NULL); /* XXX - is this actually referenced here? */ if (error == 0) { *ap->a_vpp = newvp; vrele(newvp); } return (error); } static int nfs4_createrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, nfstype ftype, struct vattr *vap, char *linktarget) { struct nfsnode *dnp = VTONFS(dvp); struct nfsnode *np = NULL; struct vnode *newvp = NULL; struct nfs4_compound cp; struct nfs4_oparg_create c; struct nfs4_oparg_getattr ga; struct nfs4_oparg_getfh gfh; caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; int error = 0; nfsstats.rpccnt[NFSPROC_CREATE]++; mreq = nfsm_reqhead(dvp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); bzero(&c, sizeof(c)); bzero(&ga, sizeof(ga)); c.type = ftype; c.vap = vap; c.linktext = linktarget; c.name = cnp->cn_nameptr; c.namelen = cnp->cn_namelen; ga.bm = &nfsv4_getattrbm; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_createrpc()"); nfsm_v4build_putfh(&cp, dvp); nfsm_v4build_create(&cp, &c); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_getfh(&cp, &gfh); nfsm_v4build_finalize(&cp); nfsm_request(dvp, NFSV4PROC_COMPOUND, cnp->cn_thread, cnp->cn_cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_create(&cp, &c); nfsm_v4dissect_getattr(&cp, &ga); nfsm_v4dissect_getfh(&cp, &gfh); error = nfs_nget(dvp->v_mount, &gfh.fh_val, gfh.fh_len, &np, LK_EXCLUSIVE); if (error != 0) goto nfsmout; newvp = NFSTOV(np); nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL); if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); dnp->n_flag |= NMODIFIED; dnp->n_attrstamp = 0; nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); /* XXX */ /*FREE(cnp->cn_pnbuf, M_NAMEI);*/ if (error != 0 && newvp != NULL) vput(newvp); else if (error == 0) *vpp = newvp; return (error); } static int nfs4_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td) { struct nfsnode *fnp = VTONFS(fdvp), *tnp = VTONFS(tdvp); caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; struct nfs4_oparg_rename r; int error = 0; nfsstats.rpccnt[NFSPROC_RENAME]++; r.fname = fnameptr; r.fnamelen = fnamelen; r.tname = tnameptr; r.tnamelen = tnamelen; nfs_v4initcompound(&cp); mreq = nfsm_reqhead(fdvp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_renamerpc()"); nfsm_v4build_putfh(&cp, fdvp); nfsm_v4build_savefh(&cp); nfsm_v4build_putfh(&cp, tdvp); nfsm_v4build_rename(&cp, &r); nfsm_v4build_finalize(&cp); nfsm_request(fdvp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_savefh(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_rename(&cp); /* XXX should this always be performed? */ fnp->n_flag |= NMODIFIED; tnp->n_flag |= NMODIFIED; fnp->n_attrstamp = tnp->n_attrstamp = 0; nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs file create call */ static int nfs4_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct nfsnode *dnp = VTONFS(dvp); struct componentname *cnp = ap->a_cnp; struct vnode *newvp = NULL; int error = 0, fmode = (O_CREAT | FREAD | FWRITE); struct vattr vattr; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) return (error); if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; error = nfs4_openrpc(dvp, &newvp, cnp, fmode, vap); if (error != 0) goto out; VTONFS(newvp)->n_flag |= NCREATED; if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; dnp->n_flag |= NMODIFIED; dnp->n_attrstamp = 0; /* XXX; wccflag */ out: return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. * - If v_usecount > 1 * If a rename is not already in the works * call nfs4_sillyrename() to set it up * else * do the remove rpc */ static int nfs4_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; #ifndef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("nfs4_remove: no name"); if (vrefcnt(vp) < 1) panic("nfs4_remove: bad v_usecount"); #endif if (vp->v_type == VDIR) error = EPERM; else if (vrefcnt(vp) == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_thread) == 0 && vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is * minimized. Without node locking it can still happen, such * that an I/O op returns ESTALE, but since you get this if * another host removes the file.. */ cache_purge(vp); /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. */ error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1); /* Do the rpc */ if (error != EINTR) error = nfs4_removerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT * since the file was in fact removed * Therefore, we cheat and return success. */ if (error == ENOENT) error = 0; } else if (!np->n_sillyrename) error = nfs4_sillyrename(dvp, vp, cnp); np->n_attrstamp = 0; return (error); } /* * nfs file remove rpc called from nfs_inactive */ int nfs4_removeit(struct sillyrename *sp) { /* * Make sure that the directory vnode is still valid. * XXX we should lock sp->s_dvp here. */ if (sp->s_dvp->v_type == VBAD) return (0); return (nfs4_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred, NULL)); } /* * Nfs remove rpc, called from nfs4_remove() and nfs4_removeit(). */ static int nfs4_removerpc(struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct thread *td) { caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; nfsstats.rpccnt[NFSPROC_REMOVE]++; mreq = nfsm_reqhead(dvp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_removerpc()"); nfsm_v4build_putfh(&cp, dvp); nfsm_v4build_remove(&cp, name, namelen); nfsm_v4build_finalize(&cp); nfsm_request(dvp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_remove(&cp); nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); VTONFS(dvp)->n_flag |= NMODIFIED; VTONFS(dvp)->n_attrstamp = 0; /* XXX wccflag */ return (error); } /* * nfs file rename call */ static int nfs4_rename(struct vop_rename_args *ap) { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; int error; #ifndef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("nfs4_rename: no name"); #endif /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (fvp == tvp) { printf("nfs4_rename: fvp == tvp (can't happen)\n"); error = 0; goto out; } - if ((error = vn_lock(fvp, LK_EXCLUSIVE, fcnp->cn_thread)) != 0) + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto out; /* * We have to flush B_DELWRI data prior to renaming * the file. If we don't, the delayed-write buffers * can be flushed out later after the file has gone stale * under NFSV3. NFSV2 does not have this problem because * ( as far as I can tell ) it flushes dirty buffers more * often. */ VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread); VOP_UNLOCK(fvp, 0, fcnp->cn_thread); if (tvp) VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread); /* * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. */ if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR && !nfs4_sillyrename(tdvp, tvp, tcnp)) { vput(tvp); tvp = NULL; } error = nfs4_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_thread); if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs file rename rpc called from nfs4_remove() above */ static int nfs4_renameit(struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp) { return (nfs4_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread)); } /* * nfs hard link create call */ static int nfs4_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; struct nfs4_oparg_link l; if (vp->v_mount != tdvp->v_mount) { return (EXDEV); } /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread); nfsstats.rpccnt[NFSPROC_LINK]++; l.name = cnp->cn_nameptr; l.namelen = cnp->cn_namelen; nfs_v4initcompound(&cp); mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_link()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_savefh(&cp); nfsm_v4build_putfh(&cp, tdvp); nfsm_v4build_link(&cp, &l); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, cnp->cn_thread, cnp->cn_cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_savefh(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_link(&cp); VTONFS(tdvp)->n_flag |= NMODIFIED; VTONFS(vp)->n_attrstamp = 0; VTONFS(tdvp)->n_attrstamp = 0; nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); return (error); } /* * nfs symbolic link create call */ static int nfs4_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; int error = 0; struct vnode *newvp = NULL; nfsstats.rpccnt[NFSPROC_SYMLINK]++; error = nfs4_createrpc(ap->a_dvp, &newvp, ap->a_cnp, NFLNK, ap->a_vap, ap->a_target); if (error != 0 && newvp != NULL) vput(newvp); else if (error == 0) *ap->a_vpp = newvp; VTONFS(dvp)->n_flag |= NMODIFIED; VTONFS(dvp)->n_attrstamp = 0; /* XXX wccflags */ return (error); } /* * nfs make dir call */ static int nfs4_mkdir(struct vop_mkdir_args *ap) { return (nfs4_createrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, NFDIR, ap->a_vap, NULL)); } /* * nfs remove directory call */ static int nfs4_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct nfsnode *dnp = VTONFS(dvp); struct componentname *cnp = ap->a_cnp; int error = 0; if (dvp == vp) return (EINVAL); error = (nfs4_removerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, NULL)); if (error) return (error); dnp->n_flag |= NMODIFIED; dnp->n_attrstamp = 0; cache_purge(dvp); cache_purge(vp); return (error); } /* * nfs readdir call */ static int nfs4_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct uio *uio = ap->a_uio; int tresid, error; struct vattr vattr; if (vp->v_type != VDIR) return (EPERM); /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0 && !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { nfsstats.direofcache_hits++; return (0); } } /* * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) nfsstats.direofcache_misses++; return (error); } static u_char fty_to_dty[] = { DT_UNKNOWN, /* NFNON */ DT_REG, /* NFREG */ DT_DIR, /* NFDIR */ DT_BLK, /* NFBLK */ DT_CHR, /* NFCHR */ DT_LNK, /* NFLNK */ DT_SOCK, /* NFSOCK */ DT_FIFO, /* NFFIFO */ DT_UNKNOWN, /* NFATTRDIT */ DT_UNKNOWN, /* NFNAMEDATTR */ DT_UNKNOWN, /* NFBAD */ }; /* * Readdir rpc call. * Called from below the buffer cache by nfs_doio(). */ int nfs4_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int len, left; struct dirent *dp = NULL; u_int32_t *tl; caddr_t p; uint64_t *cookiep; caddr_t bpos, dpos; struct mbuf *mreq, *mrep = NULL, *md, *mb; uint64_t cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp); int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; struct nfs4_compound cp; struct nfs4_oparg_readdir readdir; struct nfsv4_fattr fattr; u_int fty; #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirrpc bad uio"); #endif /* * If there is no cookie, assume directory was stale. */ cookiep = nfs4_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) cookie = *cookiep; else return (NFSERR_BAD_COOKIE); /* Generate fake entries for "." and ".." */ while (cookie < 2 && bigenough) { cookie++; len = 4 + DIRHDSIZ; if (len > uiop->uio_resid) { bigenough = 0; break; } dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_namlen = cookie; dp->d_reclen = len; dp->d_type = DT_DIR; if (cookie == 1) dp->d_fileno = dnp->n_vattr.va_fileid; /* XXX has problems with pynfs virtualhandles */ else dp->d_fileno = dnp->n_dvp != NULL ? VTONFS(dnp->n_dvp)->n_vattr.va_fileid : cookie; p = dp->d_name; *p++ = '.'; if (cookie == 2) *p++ = '.'; *p = '\0'; blksiz += len; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += len; uiop->uio_resid -= len; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + len; uiop->uio_iov->iov_len -= len; } if (cookie == 2) cookie = 0; /* This is sort of ugly, to prevent v4postop() from acting weird */ bzero(&cp, sizeof(cp)); /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. */ /* * XXX this is sort of ugly for nfsv4; we don't maintain the * strict abstraction, but do the decoding inline. that's ok. */ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIR]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); readdir.cnt = nmp->nm_readdirsize; readdir.cookie = cookie; readdir.bm = &nfsv4_readdirbm; if (cookie == 0) bzero(&readdir.verf, sizeof(readdir.verf)); else bcopy(&dnp->n_cookieverf, &readdir.verf, sizeof(readdir.verf)); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_readdirrpc()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_readdir(&cp, &readdir); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); /* * XXX - Readdir gets handled inline like in * NFSv{2,3}. This is a nasty inconsistency and * should be fixed. */ tl = nfsm_dissect(uint32_t *, 5 * NFSX_UNSIGNED); if (fxdr_unsigned(uint32_t, *tl++) != NFSV4OP_READDIR) { error = EBADRPC; goto nfsmout; } if (fxdr_unsigned(uint32_t, *tl++) != 0) { error = EBADRPC; goto nfsmout; } bcopy(tl, &dnp->n_cookieverf, NFSX_V4VERF); tl += 2; more_dirs = fxdr_unsigned(int, *tl++); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { tl = nfsm_dissect(uint32_t *, 3 * NFSX_UNSIGNED); cookie = fxdr_hyper(tl); tl += 2; /* XXX cookie sanity check */ len = fxdr_unsigned(int, *tl++); if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination */ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; /* Copy name */ nfsm_mtouio(uiop, len); p = uiop->uio_iov->iov_base; tlen -= len; *p = '\0'; /* null terminate */ /* printf("nfs4_readdirrpc: name: \"%s\" cookie %d\n", p - len, (int) cookie);*/ uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; /* Copy attributes */ nfsm_v4dissect_attrs(&fattr); dp->d_fileno = nfs_v4fileid4_to_fileid( fattr.fa4_valid & FA4V_FILEID && fattr.fa4_fileid ? fattr.fa4_fileid : cookie); fty = (u_int)fattr.fa4_type; dp->d_type = fattr.fa4_valid & FA4V_TYPE && (fty < sizeof(fty_to_dty)) ? fty_to_dty[fty] : DT_UNKNOWN; } else nfsm_adv(nfsm_rndup(len)); tl = nfsm_dissect(uint32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl++); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } error = nfs_v4postop(&cp, error); m_freem(mrep); mrep = NULL; } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirrpc resid > 0\n"); cookiep = nfs4_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; } nfsmout: if (mrep != NULL) m_freem(mrep); return (error); } /* * Silly rename. To make the NFS filesystem that is stateless look a little * more like the "ufs" a remove of an active vnode is translated to a rename * to a funny looking filename that is removed by nfs_inactive on the * nfsnode. There is the potential for another process on a different client * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... */ static int nfs4_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct sillyrename *sp; struct nfsnode *np; int error; short pid; cache_purge(dvp); np = VTONFS(vp); #ifndef DIAGNOSTIC if (vp->v_type == VDIR) panic("nfs: sillyrename dir"); #endif MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename), M_NFSREQ, M_WAITOK); sp->s_cred = crhold(cnp->cn_cred); sp->s_dvp = dvp; sp->s_removeit = nfs4_removeit; VREF(dvp); /* Fudge together a funny name */ pid = cnp->cn_thread->td_proc->p_pid; sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid); /* Try lookitups until we get one that isn't there */ while (nfs4_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_thread, NULL) == 0) { sp->s_name[4]++; if (sp->s_name[4] > 'z') { error = EINVAL; goto bad; } } error = nfs4_renameit(dvp, cnp, sp); if (error) goto bad; error = nfs4_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_thread, &np); np->n_sillyrename = sp; return (0); bad: vrele(sp->s_dvp); crfree(sp->s_cred); free((caddr_t)sp, M_NFSREQ); return (error); } /* * Look up a file name and optionally either update the file handle or * allocate an nfsnode, depending on the value of npp. * npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs4_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred, struct thread *td, struct nfsnode **npp) { struct vnode *newvp = NULL; struct nfsnode *np, *dnp = VTONFS(dvp); caddr_t bpos, dpos; int error = 0, fhlen; struct mbuf *mreq, *mrep = NULL, *md, *mb; nfsfh_t *nfhp; struct nfs4_compound cp; struct nfs4_oparg_lookup l; struct nfs4_oparg_getfh gfh; struct nfs4_oparg_getattr ga; nfsstats.rpccnt[NFSPROC_RENAME]++; mreq = nfsm_reqhead(dvp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); l.name = name; l.namelen = len; nfs_v4initcompound(&cp); ga.bm = &nfsv4_getattrbm; nfsm_v4build_compound(&cp, "nfs4_renamerpc()"); nfsm_v4build_putfh(&cp, dvp); nfsm_v4build_lookup(&cp, &l); nfsm_v4build_getfh(&cp, &gfh); nfsm_v4build_getattr(&cp, &ga); nfsm_request(dvp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_lookup(&cp); nfsm_v4dissect_getfh(&cp, &gfh); nfsm_v4dissect_getattr(&cp, &ga); if (npp != NULL && error == 0) { nfhp = &gfh.fh_val; fhlen = gfh.fh_len; if (*npp != NULL) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { free((caddr_t)np->n_fhp, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK); bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen); np->n_fhsize = fhlen; newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp, fhlen)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); } if (newvp != dvp) { np->n_dvp = dvp; np->n_namelen = len; if (np->n_name != NULL) FREE(np->n_name, M_NFSREQ); MALLOC(np->n_name, u_char *, np->n_namelen + 1, M_NFSREQ, M_WAITOK); memcpy(np->n_name, name, len); np->n_name[len] = '\0'; } nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL); } nfsmout: error = nfs_v4postop(&cp, error); if (mrep != NULL) m_freem(mrep); if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } return (error); } /* * Nfs Version 3 commit rpc */ int nfs4_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, struct thread *td) { struct nfsmount *nmp = VFSTONFS(vp->v_mount); caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfs4_compound cp; struct nfs4_oparg_commit commit; if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); nfsstats.rpccnt[NFSPROC_COMMIT]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0); mb = mreq; bpos = mtod(mb, caddr_t); commit.start = offset; commit.len = cnt; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_commit()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_commit(&cp, &commit); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_commit(&cp, &commit); /* XXX */ /* nfsm_wcc_data(vp, wccflag);*/ if (bcmp(nmp->nm_verf, commit.verf, NFSX_V4VERF)) { bcopy(commit.verf, nmp->nm_verf, NFSX_V4VERF); error = NFSERR_STALEWRITEVERF; } nfsmout: error = nfs_v4postop(&cp, error); if (mrep == NULL) m_freem(mrep); return (error); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling nfs_asyncio(), otherwise just all nfs_doio() to do the * request. */ static int nfs4_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct ucred *cr; int error = 0; KASSERT(!(bp->b_flags & B_DONE), ("nfs4_strategy: buffer %p unexpectedly marked B_DONE", bp)); KASSERT(BUF_REFCNT(bp) > 0, ("nfs4_strategy: buffer %p not locked", bp)); if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread)) error = nfs_doio(ap->a_vp, bp, cr, curthread); return (error); } /* * fsync vnode op. Just call nfs4_flush() with commit == 1. */ /* ARGSUSED */ static int nfs4_fsync(struct vop_fsync_args *ap) { return (nfs4_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. */ static int nfs4_flush(struct vnode *vp, int waitfor, struct thread *td, int commit) { struct nfsnode *np = VTONFS(vp); struct buf *bp; int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; int bvecsize = 0, bveccount; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, nfs_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV3(vp) && commit) { s = splbio(); if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; VI_LOCK(vp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs4_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. */ if (bveccount > NFS_COMMITBVECSIZ) { /* * Release the vnode interlock to avoid a lock * order reversal. */ VI_UNLOCK(vp); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); VI_LOCK(vp); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (bvecpos >= bvecsize) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { nbp = TAILQ_NEXT(bp, b_bobufs); continue; } if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT)) { BUF_UNLOCK(bp); nbp = TAILQ_NEXT(bp, b_bobufs); continue; } VI_UNLOCK(vp); bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. * * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; vfs_busy_pages(bp, 1); VI_LOCK(vp); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_bobufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } splx(s); VI_UNLOCK(vp); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. */ if (wcred != NOCRED) retv = nfs4_commit(vp, off, (int)(endoff - off), wcred, td); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = nfs4_commit(vp, off, (int)size, bp->b_wcred, td); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ s = splbio(); bufobj_wref(&vp->v_bufobj); bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_dirtyoff = bp->b_dirtyend = 0; splx(s); bufdone(bp); } } } /* * Start/do any write(s) that are required. */ loop: s = splbio(); VI_LOCK(vp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp), "nfsfsync", slpflag, slptimeo); splx(s); if (error == 0) panic("nfs4_fsync: inconsistent lock"); if (error == ENOLCK) goto loop; if (nfs4_sigintr(nmp, NULL, td)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs4_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } VI_UNLOCK(vp); bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= B_ASYNC; splx(s); bwrite(bp); goto loop; } splx(s); if (passone) { passone = 0; VI_UNLOCK(vp); goto again; } if (waitfor == MNT_WAIT) { while (vp->v_bufobj.bo_numoutput) { error = bufobj_wwait(&vp->v_bufobj, slpflag, slptimeo); if (error) { VI_UNLOCK(vp); if (nfs4_sigintr(nmp, NULL, td)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } VI_LOCK(vp); } } if (vp->v_bufobj.bo_dirty.bv_cnt > 0 && commit) { VI_UNLOCK(vp); goto loop; } } VI_UNLOCK(vp); if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); return (error); } /* * NFS advisory byte-level locks. */ static int nfs4_advlock(struct vop_advlock_args *ap) { return (EPERM); if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { struct nfsnode *np = VTONFS(ap->a_vp); return (lf_advlock(ap, &(np->n_lockf), np->n_size)); } return (nfs_dolock(ap)); } /* * Print out the contents of an nfsnode. */ static int nfs4_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); printf("\tfileid %ld fsid 0x%x", np->n_vattr.va_fileid, np->n_vattr.va_fsid); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * This is the "real" nfs::bwrite(struct buf*). * We set B_CACHE if this is a VMIO buffer. */ int nfs4_writebp(struct buf *bp, int force __unused, struct thread *td) { int s; int oldflags = bp->b_flags; #if 0 int retv = 1; off_t off; #endif if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not locked???"); if (bp->b_flags & B_INVAL) { brelse(bp); return(0); } bp->b_flags |= B_CACHE; /* * Undirty the bp. We will redirty it later if the I/O fails. */ s = splbio(); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); curthread->td_ru.ru_oublock++; splx(s); /* * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ vfs_busy_pages(bp, 1); BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if( (oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); if (oldflags & B_DELWRI) { s = splbio(); reassignbuf(bp); splx(s); } brelse(bp); return (rtval); } return (0); } /* * Just call nfs_writebp() with the force argument set to 1. * * NOTE: B_DONE may or may not be set in a_bp on call. */ static int nfs4_bwrite(struct buf *bp) { return (nfs4_writebp(bp, 1, curthread)); } struct buf_ops buf_ops_nfs4 = { .bop_name = "buf_ops_nfs4", .bop_write = nfs4_bwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; Index: head/sys/nfsclient/nfs_subs.c =================================================================== --- head/sys/nfsclient/nfs_subs.c (revision 175201) +++ head/sys/nfsclient/nfs_subs.c (revision 175202) @@ -1,1187 +1,1187 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfs_xdrneg1; u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted; u_int32_t nfs_true, nfs_false; /* And other global data */ u_int32_t nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct mtx nfs_reqq_mtx; struct nfs_bufq nfs_bufq; struct mtx nfs_xid_mtx; /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqhead(struct vnode *vp, u_long procid, int hsiz) { struct mbuf *mb; MGET(mb, M_TRYWAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_TRYWAIT); mb->m_len = 0; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type, int auth_len, struct mbuf *mrest, int mrest_len, struct mbuf **mbp, u_int32_t **xidpp) { struct mbuf *mb; u_int32_t *tl; caddr_t bpos; int i; struct mbuf *mreq; int grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_TRYWAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_TRYWAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. */ tl = nfsm_build(u_int32_t *, 8 * NFSX_UNSIGNED); mtx_lock(&nfs_xid_mtx); /* Get a pretty random xid to start with */ if (!nfs_xid) nfs_xid = random(); /* * Skip zero xid if it should ever happen. */ if (++nfs_xid == 0) nfs_xid++; *xidpp = tl; *tl++ = txdr_unsigned(nfs_xid); mtx_unlock(&nfs_xid_mtx); *tl++ = rpc_call; *tl++ = rpc_vers; *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) { *tl++ = txdr_unsigned(NFS_VER3); *tl++ = txdr_unsigned(procid); } else { *tl++ = txdr_unsigned(NFS_VER2); *tl++ = txdr_unsigned(nfsv2_procid[procid]); } /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: tl = nfsm_build(u_int32_t *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; } /* * And the verifier... */ tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = NULL; *mbp = mb; return (mreq); } /* * copies a uio scatter/gather list to an mbuf chain. * NOTE: can ony handle iovcnt == 1 */ int nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos) { char *uiocp; struct mbuf *mp, *mp2; int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; #ifdef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfsm_uiotombuf: iovcnt != 1"); #endif if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_TRYWAIT, MT_DATA); if (clflg) MCLGET(mp, M_TRYWAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + uiosiz; uiop->uio_iov->iov_len -= uiosiz; siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_TRYWAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz) { struct mbuf *m1 = NULL, *m2; long left, xfer, len, tlen; u_int32_t *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_int32_t *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_TRYWAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_TRYWAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_int32_t *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init(struct vfsconf *vfsp) { int i; nfsmount_zone = uma_zcreate("NFSMOUNT", sizeof(struct nfsmount), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = NULL; nfs_iodmount[i] = NULL; } nfs_nhinit(); /* Init the nfsnode table */ /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); callout_init(&nfs_callout, CALLOUT_MPSAFE); mtx_init(&nfs_reqq_mtx, "NFS reqq lock", NULL, MTX_DEF); mtx_init(&nfs_iod_mtx, "NFS iod lock", NULL, MTX_DEF); mtx_init(&nfs_xid_mtx, "NFS xid lock", NULL, MTX_DEF); nfs_pbuf_freecnt = nswbuf / 2 + 1; return (0); } int nfs_uninit(struct vfsconf *vfsp) { int i; callout_stop(&nfs_callout); KASSERT(TAILQ_EMPTY(&nfs_reqq), ("nfs_uninit: request queue not empty")); /* * Tell all nfsiod processes to exit. Clear nfs_iodmax, and wakeup * any sleeping nfsiods so they check nfs_iodmax and exit. */ mtx_lock(&nfs_iod_mtx); nfs_iodmax = 0; for (i = 0; i < nfs_numasync; i++) if (nfs_iodwant[i]) wakeup(&nfs_iodwant[i]); /* The last nfsiod to exit will wake us up when nfs_numasync hits 0 */ while (nfs_numasync) msleep(&nfs_numasync, &nfs_iod_mtx, PWAIT, "ioddie", 0); mtx_unlock(&nfs_iod_mtx); nfs_nhuninit(); uma_zdestroy(nfsmount_zone); return (0); } void nfs_dircookie_lock(struct nfsnode *np) { mtx_lock(&np->n_mtx); while (np->n_flag & NDIRCOOKIELK) (void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0); np->n_flag |= NDIRCOOKIELK; mtx_unlock(&np->n_mtx); } void nfs_dircookie_unlock(struct nfsnode *np) { mtx_lock(&np->n_mtx); np->n_flag &= ~NDIRCOOKIELK; wakeup(&np->n_flag); mtx_unlock(&np->n_mtx); } int nfs_upgrade_vnlock(struct vnode *vp, struct thread *td) { int old_lock; if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) { if (old_lock == LK_SHARED) { /* Upgrade to exclusive lock, this might block */ - vn_lock(vp, LK_UPGRADE | LK_RETRY, td); + vn_lock(vp, LK_UPGRADE | LK_RETRY); } else { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } } return old_lock; } void nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock) { if (old_lock != LK_EXCLUSIVE) { if (old_lock == LK_SHARED) { /* Downgrade from exclusive lock, this might block */ - vn_lock(vp, LK_DOWNGRADE, td); + vn_lock(vp, LK_DOWNGRADE); } else { VOP_UNLOCK(vp, 0, td); } } } void nfs_printf(const char *fmt, ...) { va_list ap; mtx_lock(&Giant); va_start(ap, fmt); printf(fmt, ap); va_end(ap); mtx_unlock(&Giant); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(struct vnode **vpp, struct mbuf **mdp, caddr_t *dposp, struct vattr *vaper, int dontshrink) { struct vnode *vp = *vpp; struct vattr *vap; struct nfs_fattr *fp; struct nfsnode *np; int32_t t1; caddr_t cp2; int rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime, mtime_save; int v3 = NFS_ISV3(vp); struct thread *td = curthread; md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; cp2 = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, M_TRYWAIT); if (cp2 == NULL) return EBADRPC; fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(int, fp->fa3_rdev.specdata1), fxdr_unsigned(int, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intents to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(int32_t, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. */ np = VTONFS(vp); mtx_lock(&np->n_mtx); if (vp->v_type != vtyp) { vp->v_type = vtyp; if (vp->v_type == VFIFO) vp->v_op = &nfs_fifoops; np->n_mtime = mtime; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = rdev; mtime_save = vap->va_mtime; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_hyper(&fp->fa3_size); vap->va_blocksize = NFS_FABLKSIZE; vap->va_bytes = fxdr_hyper(&fp->fa3_used); vap->va_fileid = fxdr_unsigned(int32_t, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize); vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.tv_nsec = 0; vap->va_gen = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } np->n_attrstamp = time_second; /* Timestamp the NFS otw getattr fetch */ if (td->td_proc) { np->n_ac_ts_tid = td->td_tid; np->n_ac_ts_pid = td->td_proc->p_pid; np->n_ac_ts_syscalls = td->td_syscalls; } else bzero(&np->n_ac_ts, sizeof(struct nfs_attrcache_timestamp)); if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (dontshrink && vap->va_size < np->n_size) { /* * We've been told not to shrink the file; * zero np->n_attrstamp to indicate that * the attributes are stale. */ vap->va_size = np->n_size; np->n_attrstamp = 0; } else if (np->n_flag & NMODIFIED) { /* * We've modified the file: Use the larger * of our size, and the server's size. */ if (vap->va_size < np->n_size) { vap->va_size = np->n_size; } else { np->n_size = vap->va_size; np->n_flag |= NSIZECHANGED; } } else { np->n_size = vap->va_size; np->n_flag |= NSIZECHANGED; } vnode_pager_setsize(vp, np->n_size); } else { np->n_size = vap->va_size; } } /* * The following checks are added to prevent a race between (say) * a READDIR+ and a WRITE. * READDIR+, WRITE requests sent out. * READDIR+ resp, WRITE resp received on client. * However, the WRITE resp was handled before the READDIR+ resp * causing the post op attrs from the write to be loaded first * and the attrs from the READDIR+ to be loaded later. If this * happens, we have stale attrs loaded into the attrcache. * We detect this by for the mtime moving back. We invalidate the * attrcache when this happens. */ if (timespeccmp(&mtime_save, &vap->va_mtime, >)) /* Size changed or mtime went backwards */ np->n_attrstamp = 0; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } mtx_unlock(&np->n_mtx); return (0); } #ifdef NFS_ACDEBUG #include SYSCTL_DECL(_vfs_nfs); static int nfs_acdebug; SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, ""); #endif /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(struct vnode *vp, struct vattr *vaper) { struct nfsnode *np; struct vattr *vap; struct nfsmount *nmp; int timeo; np = VTONFS(vp); vap = &np->n_vattr; nmp = VFSTONFS(vp->v_mount); #ifdef NFS_ACDEBUG mtx_lock(&Giant); /* nfs_printf() */ #endif mtx_lock(&np->n_mtx); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime.tv_sec) / 10; #ifdef NFS_ACDEBUG if (nfs_acdebug>1) nfs_printf("nfs_getattrcache: initial timeo = %d\n", timeo); #endif if (vap->va_type == VDIR) { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin) timeo = nmp->nm_acdirmin; else if (timeo > nmp->nm_acdirmax) timeo = nmp->nm_acdirmax; } else { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin) timeo = nmp->nm_acregmin; else if (timeo > nmp->nm_acregmax) timeo = nmp->nm_acregmax; } #ifdef NFS_ACDEBUG if (nfs_acdebug > 2) nfs_printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n", nmp->nm_acregmin, nmp->nm_acregmax, nmp->nm_acdirmin, nmp->nm_acdirmax); if (nfs_acdebug) nfs_printf("nfs_getattrcache: age = %d; final timeo = %d\n", (time_second - np->n_attrstamp), timeo); #endif if ((time_second - np->n_attrstamp) >= timeo) { nfsstats.attrcache_misses++; mtx_unlock(&np->n_mtx); return( ENOENT); } nfsstats.attrcache_hits++; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else { np->n_size = vap->va_size; } vnode_pager_setsize(vp, np->n_size); } else { np->n_size = vap->va_size; } } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } mtx_unlock(&np->n_mtx); #ifdef NFS_ACDEBUG mtx_unlock(&Giant); /* nfs_printf() */ #endif return (0); } static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(struct nfsnode *np, off_t off, int add) { struct nfsdmap *dp, *dp2; int pos; nfsuint64 *retval = NULL; pos = (uoff_t)off / NFS_DIRBLKSIZ; if (pos == 0 || off < 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at <= 0"); #endif return (&nfs_nullcookie); } pos--; dp = LIST_FIRST(&np->n_cookies); if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else goto out; } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (LIST_NEXT(dp, ndm_list)) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) goto out; dp = LIST_NEXT(dp, ndm_list); } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else goto out; } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else goto out; } retval = &dp->ndm_cookies[pos]; out: return (retval); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. */ void nfs_invaldir(struct vnode *vp) { struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif nfs_dircookie_lock(np); np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (LIST_FIRST(&np->n_cookies)) LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0; nfs_dircookie_unlock(np); } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * and B_CLUSTEROK flags. Once done the new write verifier can be set for the * mount point. * * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data * writes are not clusterable. */ void nfs_clearcommit(struct mount *mp) { struct vnode *vp, *nvp; struct buf *bp, *nbp; int s; s = splbio(); MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, nvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } VI_UNLOCK(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); splx(s); } /* * Helper functions for former macros. Some of these should be * moved to their callers. */ int nfsm_mtofh_xx(struct vnode *d, struct vnode **v, int v3, int *f, struct mbuf **md, caddr_t *dpos) { struct nfsnode *ttnp; struct vnode *ttvp; nfsfh_t *ttfhp; u_int32_t *tl; int ttfhsize; int t1; if (v3) { tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *f = fxdr_unsigned(int, *tl); } else *f = 1; if (*f) { t1 = nfsm_getfh_xx(&ttfhp, &ttfhsize, (v3), md, dpos); if (t1 != 0) return t1; t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp, LK_EXCLUSIVE); if (t1 != 0) return t1; *v = NFSTOV(ttnp); } if (v3) { tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*f) *f = fxdr_unsigned(int, *tl); else if (fxdr_unsigned(int, *tl)) nfsm_adv_xx(NFSX_V3FATTR, md, dpos); } if (*f) { ttvp = *v; t1 = nfs_loadattrcache(&ttvp, md, dpos, NULL, 0); if (t1) return t1; *v = ttvp; } return 0; } int nfsm_getfh_xx(nfsfh_t **f, int *s, int v3, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; if (v3) { tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int, *tl); if (*s <= 0 || *s > NFSX_V3FHMAX) return EBADRPC; } else *s = NFSX_V2FH; *f = nfsm_dissect_xx(nfsm_rndup(*s), md, dpos); if (*f == NULL) return EBADRPC; else return 0; } int nfsm_loadattr_xx(struct vnode **v, struct vattr *va, struct mbuf **md, caddr_t *dpos) { int t1; struct vnode *ttvp = *v; t1 = nfs_loadattrcache(&ttvp, md, dpos, va, 0); if (t1 != 0) return t1; *v = ttvp; return 0; } int nfsm_postop_attr_xx(struct vnode **v, int *f, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; int t1; struct vnode *ttvp = *v; tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *f = fxdr_unsigned(int, *tl); if (*f != 0) { t1 = nfs_loadattrcache(&ttvp, md, dpos, NULL, 1); if (t1 != 0) { *f = 0; return t1; } *v = ttvp; } return 0; } int nfsm_wcc_data_xx(struct vnode **v, int *f, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; int ttattrf, ttretf = 0; int t1; tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfs_true) { tl = nfsm_dissect_xx(6 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; mtx_lock(&(VTONFS(*v))->n_mtx); if (*f) ttretf = (VTONFS(*v)->n_mtime.tv_sec == fxdr_unsigned(u_int32_t, *(tl + 2)) && VTONFS(*v)->n_mtime.tv_nsec == fxdr_unsigned(u_int32_t, *(tl + 3))); mtx_unlock(&(VTONFS(*v))->n_mtx); } t1 = nfsm_postop_attr_xx(v, &ttattrf, md, dpos); if (t1) return t1; if (*f) *f = ttretf; else *f = ttattrf; return 0; } int nfsm_strtom_xx(const char *a, int s, int m, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; int t1; if (s > m) return ENAMETOOLONG; t1 = nfsm_rndup(s) + NFSX_UNSIGNED; if (t1 <= M_TRAILINGSPACE(*mb)) { tl = nfsm_build_xx(t1, mb, bpos); *tl++ = txdr_unsigned(s); *(tl + ((t1 >> 2) - 2)) = 0; bcopy(a, tl, s); } else { t1 = nfsm_strtmbuf(mb, bpos, a, s); if (t1 != 0) return t1; } return 0; } int nfsm_fhtom_xx(struct vnode *v, int v3, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; int t1; caddr_t cp; if (v3) { t1 = nfsm_rndup(VTONFS(v)->n_fhsize) + NFSX_UNSIGNED; if (t1 < M_TRAILINGSPACE(*mb)) { tl = nfsm_build_xx(t1, mb, bpos); *tl++ = txdr_unsigned(VTONFS(v)->n_fhsize); *(tl + ((t1 >> 2) - 2)) = 0; bcopy(VTONFS(v)->n_fhp, tl, VTONFS(v)->n_fhsize); } else { t1 = nfsm_strtmbuf(mb, bpos, (const char *)VTONFS(v)->n_fhp, VTONFS(v)->n_fhsize); if (t1 != 0) return t1; } } else { cp = nfsm_build_xx(NFSX_V2FH, mb, bpos); bcopy(VTONFS(v)->n_fhp, cp, NFSX_V2FH); } return 0; } void nfsm_v3attrbuild_xx(struct vattr *va, int full, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; if (va->va_mode != (mode_t)VNOVAL) { tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos); *tl++ = nfs_true; *tl = txdr_unsigned(va->va_mode); } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = nfs_false; } if (full && va->va_uid != (uid_t)VNOVAL) { tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos); *tl++ = nfs_true; *tl = txdr_unsigned(va->va_uid); } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = nfs_false; } if (full && va->va_gid != (gid_t)VNOVAL) { tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos); *tl++ = nfs_true; *tl = txdr_unsigned(va->va_gid); } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = nfs_false; } if (full && va->va_size != VNOVAL) { tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos); *tl++ = nfs_true; txdr_hyper(va->va_size, tl); } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = nfs_false; } if (va->va_atime.tv_sec != VNOVAL) { if (va->va_atime.tv_sec != time_second) { tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&va->va_atime, tl); } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); } } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } if (va->va_mtime.tv_sec != VNOVAL) { if (va->va_mtime.tv_sec != time_second) { tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&va->va_mtime, tl); } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); } } else { tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos); *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } } Index: head/sys/nfsclient/nfs_vnops.c =================================================================== --- head/sys/nfsclient/nfs_vnops.c (revision 175201) +++ head/sys/nfsclient/nfs_vnops.c (revision 175202) @@ -1,3273 +1,3273 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); /* * vnode op calls for Sun NFS version 2 and 3 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Defs */ #define TRUE 1 #define FALSE 0 /* * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these * calls are not in getblk() and brelse() so that they would not be necessary * here. */ #ifndef B_VMIO #define vfs_busy_pages(bp, f) #endif static vop_read_t nfsfifo_read; static vop_write_t nfsfifo_write; static vop_close_t nfsfifo_close; static int nfs_flush(struct vnode *, int, struct thread *, int); static int nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *, struct thread *); static vop_lookup_t nfs_lookup; static vop_create_t nfs_create; static vop_mknod_t nfs_mknod; static vop_open_t nfs_open; static vop_close_t nfs_close; static vop_access_t nfs_access; static vop_getattr_t nfs_getattr; static vop_setattr_t nfs_setattr; static vop_read_t nfs_read; static vop_fsync_t nfs_fsync; static vop_remove_t nfs_remove; static vop_link_t nfs_link; static vop_rename_t nfs_rename; static vop_mkdir_t nfs_mkdir; static vop_rmdir_t nfs_rmdir; static vop_symlink_t nfs_symlink; static vop_readdir_t nfs_readdir; static vop_strategy_t nfs_strategy; static int nfs_lookitup(struct vnode *, const char *, int, struct ucred *, struct thread *, struct nfsnode **); static int nfs_sillyrename(struct vnode *, struct vnode *, struct componentname *); static vop_access_t nfsspec_access; static vop_readlink_t nfs_readlink; static vop_print_t nfs_print; static vop_advlock_t nfs_advlock; /* * Global vfs data structures for nfs */ struct vop_vector nfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = nfs_access, .vop_advlock = nfs_advlock, .vop_close = nfs_close, .vop_create = nfs_create, .vop_fsync = nfs_fsync, .vop_getattr = nfs_getattr, .vop_getpages = nfs_getpages, .vop_putpages = nfs_putpages, .vop_inactive = nfs_inactive, .vop_lease = VOP_NULL, .vop_link = nfs_link, .vop_lookup = nfs_lookup, .vop_mkdir = nfs_mkdir, .vop_mknod = nfs_mknod, .vop_open = nfs_open, .vop_print = nfs_print, .vop_read = nfs_read, .vop_readdir = nfs_readdir, .vop_readlink = nfs_readlink, .vop_reclaim = nfs_reclaim, .vop_remove = nfs_remove, .vop_rename = nfs_rename, .vop_rmdir = nfs_rmdir, .vop_setattr = nfs_setattr, .vop_strategy = nfs_strategy, .vop_symlink = nfs_symlink, .vop_write = nfs_write, }; struct vop_vector nfs_fifoops = { .vop_default = &fifo_specops, .vop_access = nfsspec_access, .vop_close = nfsfifo_close, .vop_fsync = nfs_fsync, .vop_getattr = nfs_getattr, .vop_inactive = nfs_inactive, .vop_print = nfs_print, .vop_read = nfsfifo_read, .vop_reclaim = nfs_reclaim, .vop_setattr = nfs_setattr, .vop_write = nfsfifo_write, }; static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap); static int nfs_removerpc(struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct thread *td); static int nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td); static int nfs_renameit(struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp); /* * Global variables */ struct mtx nfs_iod_mtx; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfsv3_commit_on_close = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW, &nfsv3_commit_on_close, 0, "write+commit on close, else only write"); static int nfs_clean_pages_on_close = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW, &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close"); int nfs_directio_enable = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW, &nfs_directio_enable, 0, "Enable NFS directio"); /* * This sysctl allows other processes to mmap a file that has been opened * O_DIRECT by a process. In general, having processes mmap the file while * Direct IO is in progress can lead to Data Inconsistencies. But, we allow * this by default to prevent DoS attacks - to prevent a malicious user from * opening up files O_DIRECT preventing other users from mmap'ing these * files. "Protected" environments where stricter consistency guarantees are * required can disable this knob. The process that opened the file O_DIRECT * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not * meaningful. */ int nfs_directio_allow_mmap = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW, &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens"); #if 0 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count"); SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD, &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count"); #endif #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY \ | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP) /* * SMP Locking Note : * The list of locks after the description of the lock is the ordering * of other locks acquired with the lock held. * np->n_mtx : Protects the fields in the nfsnode. VM Object Lock VI_MTX (acquired indirectly) * nmp->nm_mtx : Protects the fields in the nfsmount. rep->r_mtx * nfs_iod_mtx : Global lock, protects shared nfsiod state. * nfs_reqq_mtx : Global lock, protects the nfs_reqq list. nmp->nm_mtx rep->r_mtx * rep->r_mtx : Protects the fields in an nfsreq. */ static int nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td, struct ucred *cred) { const int v3 = 1; u_int32_t *tl; int error = 0, attrflag; struct mbuf *mreq, *mrep, *md, *mb; caddr_t bpos, dpos; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); nfsstats.rpccnt[NFSPROC_ACCESS]++; mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(wmode); nfsm_request(vp, NFSPROC_ACCESS, td, cred); nfsm_postop_attr(vp, attrflag); if (!error) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); rmode = fxdr_unsigned(u_int32_t, *tl); mtx_lock(&np->n_mtx); np->n_mode = rmode; np->n_modeuid = cred->cr_uid; np->n_modestamp = time_second; mtx_unlock(&np->n_mtx); } m_freem(mrep); nfsmout: return (error); } /* * nfs access vnode op. * For nfs version 2, just return ok. File accesses may fail later. * For nfs version 3, use the access rpc to check accessibility. If file modes * are changed on the server, accesses might still fail later. */ static int nfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int error = 0; u_int32_t mode, wmode; int v3 = NFS_ISV3(vp); struct nfsnode *np = VTONFS(vp); /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * For nfs v3, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. * If not, do an access rpc, otherwise you are stuck emulating * ufs_access() locally using the vattr. This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ if (v3) { if (ap->a_mode & VREAD) mode = NFSV3ACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_EXECUTE; } else { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_LOOKUP; } /* XXX safety belt, only make blanket request if caching */ if (nfsaccess_cache_timeout > 0) { wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? */ mtx_lock(&np->n_mtx); if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) && (ap->a_cred->cr_uid == np->n_modeuid) && ((np->n_mode & mode) == mode)) { nfsstats.accesscache_hits++; } else { /* * Either a no, or a don't know. Go to the wire. */ nfsstats.accesscache_misses++; mtx_unlock(&np->n_mtx); error = nfs3_access_otw(vp, wmode, ap->a_td,ap->a_cred); mtx_lock(&np->n_mtx); if (!error) { if ((np->n_mode & mode) != mode) { error = EACCES; } } } mtx_unlock(&np->n_mtx); return (error); } else { if ((error = nfsspec_access(ap)) != 0) { return (error); } /* * Attempt to prevent a mapped root from accessing a file * which it shouldn't. We try to read a byte from the file * if the user is root and the file is not zero length. * After calling nfsspec_access, we should have the correct * file size cached. */ mtx_lock(&np->n_mtx); if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; mtx_unlock(&np->n_mtx); aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = ap->a_td; if (vp->v_type == VREG) error = nfs_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; error = nfs_readdirrpc(vp, &auio, ap->a_cred); free(bp, M_TEMP); } else if (vp->v_type == VLNK) error = nfs_readlinkrpc(vp, &auio, ap->a_cred); else error = EACCES; } else mtx_unlock(&np->n_mtx); return (error); } } int nfs_otw_getattr_avoid = 0; /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr vattr; int error; int fmode = ap->a_mode; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) return (EOPNOTSUPP); /* * Get a valid lease. If cached data is stale, flush it. */ mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) return (error); np->n_attrstamp = 0; if (vp->v_type == VDIR) np->n_direofoffset = 0; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; mtx_unlock(&np->n_mtx); } else { struct thread *td = curthread; if (np->n_ac_ts_syscalls != td->td_syscalls || np->n_ac_ts_tid != td->td_tid || td->td_proc == NULL || np->n_ac_ts_pid != td->td_proc->p_pid) { np->n_attrstamp = 0; } mtx_unlock(&np->n_mtx); error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); mtx_lock(&np->n_mtx); if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { if (vp->v_type == VDIR) np->n_direofoffset = 0; mtx_unlock(&np->n_mtx); error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { return (error); } mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; } mtx_unlock(&np->n_mtx); } /* * If the object has >= 1 O_DIRECT active opens, we disable caching. */ if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { if (np->n_directio_opens == 0) { error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error) return (error); mtx_lock(&np->n_mtx); np->n_flag |= NNONCACHE; mtx_unlock(&np->n_mtx); } np->n_directio_opens++; } vnode_create_vobject(vp, vattr.va_size, ap->a_td); return (0); } /* * nfs close vnode op * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: * 1 - So that any write errors may be reported back to the client process * doing the close system call. By far the two most likely errors are * NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure. * 2 - To put a worst case upper bound on cache inconsistency between * multiple clients for the file. * There is also a consistency problem for Version 2 of the protocol w.r.t. * not being able to tell if other clients are writing a file concurrently, * since there is no way of knowing if the changed modify time in the reply * is only due to the write for this client. * (NFS Version 3 provides weak cache consistency data in the reply that * should be sufficient to detect and handle this case.) * * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers * for NFS Version 3 - flush dirty buffers to the server but don't invalidate * or commit them (this satisfies 1 and 2 except for the * case where the server crashes after this close but * before the commit RPC, which is felt to be "good * enough". Changing the last argument to nfs_flush() to * a 1 would force a commit operation, if it is felt a * commit is necessary now. */ /* ARGSUSED */ static int nfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); int error = 0; int fmode = ap->a_fflag; if (vp->v_type == VREG) { /* * Examine and clean dirty pages, regardless of NMODIFIED. * This closes a major hole in close-to-open consistency. * We want to push out all dirty pages (and buffers) on * close, regardless of whether they were dirtied by * mmap'ed writes or via write(). */ if (nfs_clean_pages_on_close && vp->v_object) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); if (NFS_ISV3(vp)) { /* * Under NFSv3 we have dirty buffers to dispose of. We * must flush them to the NFS server. We have the option * of waiting all the way through the commit rpc or just * waiting for the initial write. The default is to only * wait through the initial write so the data is in the * server's cache, which is roughly similar to the state * a standard disk subsystem leaves the file in on close(). * * We cannot clear the NMODIFIED bit in np->n_flag due to * potential races with other processes, and certainly * cannot clear it if we don't commit. */ int cm = nfsv3_commit_on_close ? 1 : 0; error = nfs_flush(vp, MNT_WAIT, ap->a_td, cm); /* np->n_flag &= ~NMODIFIED; */ } else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); mtx_lock(&np->n_mtx); } /* * Invalidate the attribute cache in all cases. * An open is going to fetch fresh attrs any way, other procs * on this node that have file open will be forced to do an * otw attr fetch, but this is safe. */ np->n_attrstamp = 0; if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; error = np->n_error; } mtx_unlock(&np->n_mtx); } if (nfs_directio_enable) KASSERT((np->n_directio_asyncwr == 0), ("nfs_close: dirty unflushed (%d) directio buffers\n", np->n_directio_asyncwr)); if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { mtx_lock(&np->n_mtx); KASSERT((np->n_directio_opens > 0), ("nfs_close: unexpectedly value (0) of n_directio_opens\n")); np->n_directio_opens--; if (np->n_directio_opens == 0) np->n_flag &= ~NNONCACHE; mtx_unlock(&np->n_mtx); } return (error); } /* * nfs getattr call from vfs. */ static int nfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep, *md, *mb; int v3 = NFS_ISV3(vp); /* * Update local times for special files. */ mtx_lock(&np->n_mtx); if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; mtx_unlock(&np->n_mtx); /* * First look in the cache. */ if (nfs_getattrcache(vp, ap->a_vap) == 0) goto nfsmout; if (v3 && nfsaccess_cache_timeout > 0) { nfsstats.accesscache_misses++; nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_td, ap->a_cred); if (nfs_getattrcache(vp, ap->a_vap) == 0) goto nfsmout; } nfsstats.rpccnt[NFSPROC_GETATTR]++; mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_GETATTR, ap->a_td, ap->a_cred); if (!error) { nfsm_loadattr(vp, ap->a_vap); } m_freem(mrep); nfsmout: return (error); } /* * nfs setattr call. */ static int nfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr *vap = ap->a_vap; int error = 0; u_quad_t tsize; #ifndef nolint tsize = (u_quad_t)0; #endif /* * Setting of flags and marking of atimes are not supported. */ if (vap->va_flags != VNOVAL || (vap->va_vaflags & VA_MARK_ATIME)) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { error = EROFS; goto out; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_mtime.tv_sec == VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL) return (0); vap->va_size = VNOVAL; break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * We run vnode_pager_setsize() early (why?), * we must set np->n_size now to avoid vinvalbuf * V_SAVE races that might setsize a lower * value. */ mtx_lock(&np->n_mtx); tsize = np->n_size; mtx_unlock(&np->n_mtx); error = nfs_meta_setsize(vp, ap->a_cred, ap->a_td, vap->va_size); mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { tsize = np->n_size; mtx_unlock(&np->n_mtx); if (vap->va_size == 0) error = nfs_vinvalbuf(vp, 0, ap->a_td, 1); else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error) { vnode_pager_setsize(vp, tsize); goto out; } } else mtx_unlock(&np->n_mtx); /* * np->n_size has already been set to vap->va_size * in nfs_meta_setsize(). We must set it again since * nfs_loadattrcache() could be called through * nfs_meta_setsize() and could modify np->n_size. */ mtx_lock(&np->n_mtx); np->n_vattr.va_size = np->n_size = vap->va_size; mtx_unlock(&np->n_mtx); }; } else { mtx_lock(&np->n_mtx); if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG) { mtx_unlock(&np->n_mtx); if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 && (error == EINTR || error == EIO)) return error; } else mtx_unlock(&np->n_mtx); } error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_td); if (error && vap->va_size != VNOVAL) { mtx_lock(&np->n_mtx); np->n_size = np->n_vattr.va_size = tsize; vnode_pager_setsize(vp, tsize); mtx_unlock(&np->n_mtx); } out: return (error); } /* * Do an nfs setattr rpc. */ static int nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { struct nfsv2_sattr *sp; struct nfsnode *np = VTONFS(vp); caddr_t bpos, dpos; u_int32_t *tl; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb; int v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_SETATTR]++; mreq = nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); if (v3) { nfsm_v3attrbuild(vap, TRUE); tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR); if (vap->va_mode == (mode_t)VNOVAL) sp->sa_mode = nfs_xdrneg1; else sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode); if (vap->va_uid == (uid_t)VNOVAL) sp->sa_uid = nfs_xdrneg1; else sp->sa_uid = txdr_unsigned(vap->va_uid); if (vap->va_gid == (gid_t)VNOVAL) sp->sa_gid = nfs_xdrneg1; else sp->sa_gid = txdr_unsigned(vap->va_gid); sp->sa_size = txdr_unsigned(vap->va_size); txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(vp, NFSPROC_SETATTR, td, cred); if (v3) { np->n_modestamp = 0; nfsm_wcc_data(vp, wccflag); } else nfsm_loadattr(vp, NULL); m_freem(mrep); nfsmout: return (error); } /* * nfs lookup call, one step at a time... * First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int flags = cnp->cn_flags; struct vnode *newvp; struct nfsmount *nmp; caddr_t bpos, dpos; struct mbuf *mreq, *mrep, *md, *mb; long len; nfsfh_t *fhp; struct nfsnode *np; int error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct thread *td = cnp->cn_thread; *vpp = NULLVP; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) { *vpp = NULLVP; return (error); } if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { struct vattr vattr; newvp = *vpp; if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, td) && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { nfsstats.lookupcache_hits++; if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); if (dvp != newvp) vput(newvp); else vrele(newvp); *vpp = NULLVP; } error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; nfsstats.rpccnt[NFSPROC_LOOKUP]++; len = cnp->cn_namelen; mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred); if (error) { if (v3) { nfsm_postop_attr(dvp, attrflag); m_freem(mrep); } goto nfsmout; } nfsm_getfh(fhp, fhsize, v3); /* * Handle RENAME case... */ if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) { m_freem(mrep); return (EISDIR); } error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, NULL); *vpp = newvp; m_freem(mrep); cnp->cn_flags |= SAVENAME; return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); newvp = NFSTOV(np); } else if (NFS_CMPFH(np, fhp, fhsize)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, NULL); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(dvp, newvp, cnp); } *vpp = newvp; m_freem(mrep); nfsmout: if (error) { if (newvp != NULLVP) { vput(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { if (dvp->v_mount->mnt_flag & MNT_RDONLY) error = EROFS; else error = EJUSTRETURN; } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } return (error); } /* * nfs read call. * Just call nfs_bioread() to do the work. */ static int nfs_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VREG: return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); case VDIR: return (EISDIR); default: return (EOPNOTSUPP); } } /* * nfs readlink call */ static int nfs_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by nfs_doio() from below the buffer cache. */ int nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { caddr_t bpos, dpos; int error = 0, len, attrflag; struct mbuf *mreq, *mrep, *md, *mb; int v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_READLINK]++; mreq = nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred); if (v3) nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_strsiz(len, NFS_MAXPATHLEN); if (len == NFS_MAXPATHLEN) { struct nfsnode *np = VTONFS(vp); mtx_lock(&np->n_mtx); if (np->n_size && np->n_size < NFS_MAXPATHLEN) len = np->n_size; mtx_unlock(&np->n_mtx); } nfsm_mtouio(uiop, len); } m_freem(mrep); nfsmout: return (error); } /* * nfs read rpc call * Ditto above */ int nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { u_int32_t *tl; caddr_t bpos, dpos; struct mbuf *mreq, *mrep, *md, *mb; struct nfsmount *nmp; int error = 0, len, retlen, tsiz, eof, attrflag; int v3 = NFS_ISV3(vp); int rsize; #ifndef nolint eof = 0; #endif nmp = VFSTONFS(vp->v_mount); tsiz = uiop->uio_resid; mtx_lock(&nmp->nm_mtx); if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) { mtx_unlock(&nmp->nm_mtx); return (EFBIG); } rsize = nmp->nm_rsize; mtx_unlock(&nmp->nm_mtx); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_READ]++; len = (tsiz > rsize) ? rsize : tsiz; mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3); if (v3) { txdr_hyper(uiop->uio_offset, tl); *(tl + 2) = txdr_unsigned(len); } else { *tl++ = txdr_unsigned(uiop->uio_offset); *tl++ = txdr_unsigned(len); *tl = 0; } nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred); if (v3) { nfsm_postop_attr(vp, attrflag); if (error) { m_freem(mrep); goto nfsmout; } tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else { nfsm_loadattr(vp, NULL); } nfsm_strsiz(retlen, rsize); nfsm_mtouio(uiop, retlen); m_freem(mrep); tsiz -= retlen; if (v3) { if (eof || retlen == 0) { tsiz = 0; } } else if (retlen < len) { tsiz = 0; } } nfsmout: return (error); } /* * nfs write call */ int nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, int *iomode, int *must_commit) { u_int32_t *tl; int32_t backup; caddr_t bpos, dpos; struct mbuf *mreq, *mrep, *md, *mb; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit; int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC; int wsize; #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs: writerpc iovcnt > 1"); #endif *must_commit = 0; tsiz = uiop->uio_resid; mtx_lock(&nmp->nm_mtx); if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) { mtx_unlock(&nmp->nm_mtx); return (EFBIG); } wsize = nmp->nm_wsize; mtx_unlock(&nmp->nm_mtx); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > wsize) ? wsize : tsiz; mreq = nfsm_reqhead(vp, NFSPROC_WRITE, NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); if (v3) { tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else { u_int32_t x; tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED); /* Set both "begin" and "current" to non-garbage. */ x = txdr_unsigned((u_int32_t)uiop->uio_offset); *tl++ = x; /* "begin offset" */ *tl++ = x; /* "current offset" */ x = txdr_unsigned(len); *tl++ = x; /* total to this offset */ *tl = x; /* size of this write */ } nfsm_uiotom(uiop, len); nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred); if (v3) { wccflag = NFSV3_WCCCHK; nfsm_wcc_data(vp, wccflag); if (!error) { tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF); rlen = fxdr_unsigned(int, *tl++); if (rlen == 0) { error = NFSERR_IO; m_freem(mrep); break; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest committment level * obtained by any of the RPCs. */ if (committed == NFSV3WRITE_FILESYNC) committed = commit; else if (committed == NFSV3WRITE_DATASYNC && commit == NFSV3WRITE_UNSTABLE) committed = commit; mtx_lock(&nmp->nm_mtx); if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){ bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); nmp->nm_state |= NFSSTA_HASWRITEVERF; } else if (bcmp((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) { *must_commit = 1; bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); } mtx_unlock(&nmp->nm_mtx); } } else { nfsm_loadattr(vp, NULL); } if (wccflag) { mtx_lock(&(VTONFS(vp))->n_mtx); VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime; mtx_unlock(&(VTONFS(vp))->n_mtx); } m_freem(mrep); if (error) break; tsiz -= len; } nfsmout: if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC) committed = NFSV3WRITE_FILESYNC; *iomode = committed; if (error) uiop->uio_resid = tsiz; return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. */ static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { struct nfsv2_sattr *sp; u_int32_t *tl; struct vnode *newvp = NULL; struct nfsnode *np = NULL; struct vattr vattr; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb; u_int32_t rdev; int v3 = NFS_ISV3(dvp); if (vap->va_type == VCHR || vap->va_type == VBLK) rdev = txdr_unsigned(vap->va_rdev); else if (vap->va_type == VFIFO || vap->va_type == VSOCK) rdev = nfs_xdrneg1; else { return (EOPNOTSUPP); } if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) { return (error); } nfsstats.rpccnt[NFSPROC_MKNOD]++; mreq = nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED + + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl++ = vtonfsv3_type(vap->va_type); nfsm_v3attrbuild(vap, FALSE); if (vap->va_type == VCHR || vap->va_type == VBLK) { tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(umajor(vap->va_rdev)); *tl = txdr_unsigned(uminor(vap->va_rdev)); } } else { sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = rdev; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred); if (!error) { nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = NULL; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); m_freem(mrep); nfsmout: if (error) { if (newvp) vput(newvp); } else { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *vpp = newvp; } mtx_lock(&(VTONFS(dvp))->n_mtx); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; mtx_unlock(&(VTONFS(dvp))->n_mtx); return (error); } /* * nfs mknod vop * just call nfs_mknodrpc() to do the work. */ /* ARGSUSED */ static int nfs_mknod(struct vop_mknod_args *ap) { return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap)); } static u_long create_verf; /* * nfs file create call */ static int nfs_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsv2_sattr *sp; u_int32_t *tl; struct nfsnode *np = NULL; struct vnode *newvp = NULL; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0; struct mbuf *mreq, *mrep, *md, *mb; struct vattr vattr; int v3 = NFS_ISV3(dvp); /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) { return (error); } if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; again: nfsstats.rpccnt[NFSPROC_CREATE]++; mreq = nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET if (!TAILQ_EMPTY(&in_ifaddrhead)) *tl++ = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr.s_addr; else #endif *tl++ = create_verf; *tl = ++create_verf; } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); nfsm_v3attrbuild(vap, FALSE); } } else { sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = 0; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred); if (!error) { nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = NULL; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); m_freem(mrep); nfsmout: if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } if (newvp) vput(newvp); } else if (v3 && (fmode & O_EXCL)) { /* * We are normally called with only a partially initialized * VAP. Since the NFSv3 spec says that server may use the * file attributes to store the verifier, the spec requires * us to do a SETATTR RPC. FreeBSD servers store the verifier * in atime, but we can't really assume that all servers will * so we ensure that our SETATTR sets both atime and mtime. */ if (vap->va_mtime.tv_sec == VNOVAL) vfs_timestamp(&vap->va_mtime); if (vap->va_atime.tv_sec == VNOVAL) vap->va_atime = vap->va_mtime; error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_thread); if (error) vput(newvp); } if (!error) { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } mtx_lock(&(VTONFS(dvp))->n_mtx); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; mtx_unlock(&(VTONFS(dvp))->n_mtx); return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. * - If v_usecount > 1 * If a rename is not already in the works * call nfs_sillyrename() to set it up * else * do the remove rpc */ static int nfs_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; #ifndef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("nfs_remove: no name"); if (vrefcnt(vp) < 1) panic("nfs_remove: bad v_usecount"); #endif if (vp->v_type == VDIR) error = EPERM; else if (vrefcnt(vp) == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_thread) == 0 && vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is * minimized. Without node locking it can still happen, such * that an I/O op returns ESTALE, but since you get this if * another host removes the file.. */ cache_purge(vp); /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. */ error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1); /* Do the rpc */ if (error != EINTR && error != EIO) error = nfs_removerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT * since the file was in fact removed * Therefore, we cheat and return success. */ if (error == ENOENT) error = 0; } else if (!np->n_sillyrename) error = nfs_sillyrename(dvp, vp, cnp); np->n_attrstamp = 0; return (error); } /* * nfs file remove rpc called from nfs_inactive */ int nfs_removeit(struct sillyrename *sp) { /* * Make sure that the directory vnode is still valid. * XXX we should lock sp->s_dvp here. */ if (sp->s_dvp->v_type == VBAD) return (0); return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred, NULL)); } /* * Nfs remove rpc, called from nfs_remove() and nfs_removeit(). */ static int nfs_removerpc(struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct thread *td) { caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_REMOVE]++; mreq = nfsm_reqhead(dvp, NFSPROC_REMOVE, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(name, namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_REMOVE, td, cred); if (v3) nfsm_wcc_data(dvp, wccflag); m_freem(mrep); nfsmout: mtx_lock(&(VTONFS(dvp))->n_mtx); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; mtx_unlock(&(VTONFS(dvp))->n_mtx); return (error); } /* * nfs file rename call */ static int nfs_rename(struct vop_rename_args *ap) { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; int error; #ifndef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("nfs_rename: no name"); #endif /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (fvp == tvp) { nfs_printf("nfs_rename: fvp == tvp (can't happen)\n"); error = 0; goto out; } - if ((error = vn_lock(fvp, LK_EXCLUSIVE, fcnp->cn_thread)) != 0) + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto out; /* * We have to flush B_DELWRI data prior to renaming * the file. If we don't, the delayed-write buffers * can be flushed out later after the file has gone stale * under NFSV3. NFSV2 does not have this problem because * ( as far as I can tell ) it flushes dirty buffers more * often. * * Skip the rename operation if the fsync fails, this can happen * due to the server's volume being full, when we pushed out data * that was written back to our cache earlier. Not checking for * this condition can result in potential (silent) data loss. */ error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread); VOP_UNLOCK(fvp, 0, fcnp->cn_thread); if (!error && tvp) error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread); if (error) goto out; /* * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. */ if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) { vput(tvp); tvp = NULL; } error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_thread); if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs file rename rpc called from nfs_remove() above */ static int nfs_renameit(struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp) { return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread)); } /* * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit(). */ static int nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td) { caddr_t bpos, dpos; int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb; int v3 = NFS_ISV3(fdvp); nfsstats.rpccnt[NFSPROC_RENAME]++; mreq = nfsm_reqhead(fdvp, NFSPROC_RENAME, (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(fdvp, v3); nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN); nfsm_fhtom(tdvp, v3); nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN); nfsm_request(fdvp, NFSPROC_RENAME, td, cred); if (v3) { nfsm_wcc_data(fdvp, fwccflag); nfsm_wcc_data(tdvp, twccflag); } m_freem(mrep); nfsmout: mtx_lock(&(VTONFS(fdvp))->n_mtx); VTONFS(fdvp)->n_flag |= NMODIFIED; mtx_unlock(&(VTONFS(fdvp))->n_mtx); mtx_lock(&(VTONFS(tdvp))->n_mtx); VTONFS(tdvp)->n_flag |= NMODIFIED; mtx_unlock(&(VTONFS(tdvp))->n_mtx); if (!fwccflag) VTONFS(fdvp)->n_attrstamp = 0; if (!twccflag) VTONFS(tdvp)->n_attrstamp = 0; return (error); } /* * nfs hard link create call */ static int nfs_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0; struct mbuf *mreq, *mrep, *md, *mb; int v3; if (vp->v_mount != tdvp->v_mount) { return (EXDEV); } /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread); v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_LINK]++; mreq = nfsm_reqhead(vp, NFSPROC_LINK, NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); nfsm_fhtom(tdvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred); if (v3) { nfsm_postop_attr(vp, attrflag); nfsm_wcc_data(tdvp, wccflag); } m_freem(mrep); nfsmout: mtx_lock(&(VTONFS(tdvp))->n_mtx); VTONFS(tdvp)->n_flag |= NMODIFIED; mtx_unlock(&(VTONFS(tdvp))->n_mtx); if (!attrflag) VTONFS(vp)->n_attrstamp = 0; if (!wccflag) VTONFS(tdvp)->n_attrstamp = 0; return (error); } /* * nfs symbolic link create call */ static int nfs_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsv2_sattr *sp; caddr_t bpos, dpos; int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp; struct mbuf *mreq, *mrep, *md, *mb; struct vnode *newvp = NULL; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_SYMLINK]++; slen = strlen(ap->a_target); mreq = nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); if (!v3) { sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } /* * Issue the NFS request and get the rpc response. * * Only NFSv3 responses returning an error of 0 actually return * a file handle that can be converted into newvp without having * to do an extra lookup rpc. */ nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred); if (v3) { if (error == 0) nfsm_mtofh(dvp, newvp, v3, gotvp); nfsm_wcc_data(dvp, wccflag); } /* * out code jumps -> here, mrep is also freed. */ m_freem(mrep); nfsmout: /* * If we do not have an error and we could not extract the newvp from * the response due to the request being NFSv2, we have to do a * lookup in order to obtain a newvp to return. */ if (error == 0 && newvp == NULL) { struct nfsnode *np = NULL; error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np); if (!error) newvp = NFSTOV(np); } if (error) { if (newvp) vput(newvp); } else { *ap->a_vpp = newvp; } mtx_lock(&(VTONFS(dvp))->n_mtx); VTONFS(dvp)->n_flag |= NMODIFIED; mtx_unlock(&(VTONFS(dvp))->n_mtx); if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs make dir call */ static int nfs_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsv2_sattr *sp; int len; struct nfsnode *np = NULL; struct vnode *newvp = NULL; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR; int gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb; struct vattr vattr; int v3 = NFS_ISV3(dvp); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) { return (error); } len = cnp->cn_namelen; nfsstats.rpccnt[NFSPROC_MKDIR]++; mreq = nfsm_reqhead(dvp, NFSPROC_MKDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } else { sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred); if (!error) nfsm_mtofh(dvp, newvp, v3, gotvp); if (v3) nfsm_wcc_data(dvp, wccflag); m_freem(mrep); nfsmout: mtx_lock(&(VTONFS(dvp))->n_mtx); VTONFS(dvp)->n_flag |= NMODIFIED; mtx_unlock(&(VTONFS(dvp))->n_mtx); if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; if (error == 0 && newvp == NULL) { error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, cnp->cn_thread, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) error = EEXIST; } } if (error) { if (newvp) vput(newvp); } else *ap->a_vpp = newvp; return (error); } /* * nfs remove directory call */ static int nfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb; int v3 = NFS_ISV3(dvp); if (dvp == vp) return (EINVAL); nfsstats.rpccnt[NFSPROC_RMDIR]++; mreq = nfsm_reqhead(dvp, NFSPROC_RMDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred); if (v3) nfsm_wcc_data(dvp, wccflag); m_freem(mrep); nfsmout: mtx_lock(&(VTONFS(dvp))->n_mtx); VTONFS(dvp)->n_flag |= NMODIFIED; mtx_unlock(&(VTONFS(dvp))->n_mtx); if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; cache_purge(dvp); cache_purge(vp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs readdir call */ static int nfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct uio *uio = ap->a_uio; int tresid, error = 0; struct vattr vattr; if (vp->v_type != VDIR) return(EPERM); /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0) { mtx_lock(&np->n_mtx); if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { mtx_unlock(&np->n_mtx); nfsstats.direofcache_hits++; goto out; } else mtx_unlock(&np->n_mtx); } } /* * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) { nfsstats.direofcache_misses++; } out: return (error); } /* * Readdir rpc call. * Called from below the buffer cache by nfs_doio(). */ int nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int len, left; struct dirent *dp = NULL; u_int32_t *tl; caddr_t cp; nfsuint64 *cookiep; caddr_t bpos, dpos; struct mbuf *mreq, *mrep, *md, *mb; nfsuint64 cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp); u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag; int v3 = NFS_ISV3(vp); #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirrpc bad uio"); #endif /* * If there is no cookie, assume directory was stale. */ nfs_dircookie_lock(dnp); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) { cookie = *cookiep; nfs_dircookie_unlock(dnp); } else { nfs_dircookie_unlock(dnp); return (NFSERR_BAD_COOKIE); } /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIR]++; mreq = nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) + NFSX_READDIR(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); if (v3) { tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; *tl++ = cookie.nfsuquad[1]; mtx_lock(&dnp->n_mtx); *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; mtx_unlock(&dnp->n_mtx); } else { tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; } *tl = txdr_unsigned(nmp->nm_readdirsize); nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred); if (v3) { nfsm_postop_attr(vp, attrflag); if (!error) { tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED); mtx_lock(&dnp->n_mtx); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl; mtx_unlock(&dnp->n_mtx); } else { m_freem(mrep); goto nfsmout; } } tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { if (v3) { tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED); fileno = fxdr_hyper(tl); len = fxdr_unsigned(int, *(tl + 2)); } else { tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED); fileno = fxdr_unsigned(u_quad_t, *tl++); len = fxdr_unsigned(int, *tl); } if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; m_freem(mrep); goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination */ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; nfsm_mtouio(uiop, len); cp = uiop->uio_iov->iov_base; tlen -= len; *cp = '\0'; /* null terminate */ uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; } else nfsm_adv(nfsm_rndup(len)); if (v3) { tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED); } else { tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED); } if (bigenough) { cookie.nfsuquad[0] = *tl++; if (v3) cookie.nfsuquad[1] = *tl++; } else if (v3) tl += 2; else tl++; more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } m_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) nfs_printf("EEK! readdirrpc resid > 0\n"); nfs_dircookie_lock(dnp); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; nfs_dircookie_unlock(dnp); } nfsmout: return (error); } /* * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc(). */ int nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int len, left; struct dirent *dp; u_int32_t *tl; caddr_t cp; struct vnode *newvp; nfsuint64 *cookiep; caddr_t bpos, dpos, dpossav1, dpossav2; struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2; struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; nfsuint64 cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp), *np; nfsfh_t *fhp; u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i; int attrflag, fhsize; #ifndef nolint dp = NULL; #endif #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirplusrpc bad uio"); #endif ndp->ni_dvp = vp; newvp = NULLVP; /* * If there is no cookie, assume directory was stale. */ nfs_dircookie_lock(dnp); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) { cookie = *cookiep; nfs_dircookie_unlock(dnp); } else { nfs_dircookie_unlock(dnp); return (NFSERR_BAD_COOKIE); } /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIRPLUS]++; mreq = nfsm_reqhead(vp, NFSPROC_READDIRPLUS, NFSX_FH(1) + 6 * NFSX_UNSIGNED); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, 1); tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; *tl++ = cookie.nfsuquad[1]; mtx_lock(&dnp->n_mtx); *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; mtx_unlock(&dnp->n_mtx); *tl++ = txdr_unsigned(nmp->nm_readdirsize); *tl = txdr_unsigned(nmp->nm_rsize); nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred); nfsm_postop_attr(vp, attrflag); if (error) { m_freem(mrep); goto nfsmout; } tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED); mtx_lock(&dnp->n_mtx); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl++; mtx_unlock(&dnp->n_mtx); more_dirs = fxdr_unsigned(int, *tl); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED); fileno = fxdr_hyper(tl); len = fxdr_unsigned(int, *(tl + 2)); if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; m_freem(mrep); goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination*/ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; cnp->cn_nameptr = uiop->uio_iov->iov_base; cnp->cn_namelen = len; nfsm_mtouio(uiop, len); cp = uiop->uio_iov->iov_base; tlen -= len; *cp = '\0'; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; } else nfsm_adv(nfsm_rndup(len)); tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED); if (bigenough) { cookie.nfsuquad[0] = *tl++; cookie.nfsuquad[1] = *tl++; } else tl += 2; /* * Since the attributes are before the file handle * (sigh), we must skip over the attributes and then * come back and get them. */ attrflag = fxdr_unsigned(int, *tl); if (attrflag) { dpossav1 = dpos; mdsav1 = md; nfsm_adv(NFSX_V3FATTR); tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); doit = fxdr_unsigned(int, *tl); /* * Skip loading the attrs for "..". There's a * race between loading the attrs here and * lookups that look for the directory currently * being read (in the parent). We try to acquire * the exclusive lock on ".." here, owning the * lock on the directory being read. Lookup will * hold the lock on ".." and try to acquire the * lock on the directory being read. * * There are other ways of fixing this, one would * be to do a trylock on the ".." vnode and skip * loading the attrs on ".." if it happens to be * locked by another process. But skipping the * attrload on ".." seems the easiest option. */ if (strcmp(dp->d_name, "..") == 0) { doit = 0; /* * We've already skipped over the attrs, * skip over the filehandle. And store d_type * as VDIR. */ tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); nfsm_adv(nfsm_rndup(i)); dp->d_type = IFTODT(VTTOIF(VDIR)); } if (doit) { nfsm_getfh(fhp, fhsize, 1); if (NFS_CMPFH(dnp, fhp, fhsize)) { VREF(vp); newvp = vp; np = dnp; } else { error = nfs_nget(vp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) doit = 0; else newvp = NFSTOV(np); } } if (doit && bigenough) { dpossav2 = dpos; dpos = dpossav1; mdsav2 = md; md = mdsav1; nfsm_loadattr(newvp, NULL); dpos = dpossav2; md = mdsav2; dp->d_type = IFTODT(VTTOIF(np->n_vattr.va_type)); ndp->ni_vp = newvp; /* Update n_ctime, so subsequent lookup doesn't purge entry */ np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp); } } else { /* Just skip over the file handle */ tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); if (i) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); fhsize = fxdr_unsigned(int, *tl); nfsm_adv(nfsm_rndup(fhsize)); } } if (newvp != NULLVP) { if (newvp == vp) vrele(newvp); else vput(newvp); newvp = NULLVP; } tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } m_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) nfs_printf("EEK! readdirplusrpc resid > 0\n"); nfs_dircookie_lock(dnp); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; nfs_dircookie_unlock(dnp); } nfsmout: if (newvp != NULLVP) { if (newvp == vp) vrele(newvp); else vput(newvp); newvp = NULLVP; } return (error); } /* * Silly rename. To make the NFS filesystem that is stateless look a little * more like the "ufs" a remove of an active vnode is translated to a rename * to a funny looking filename that is removed by nfs_inactive on the * nfsnode. There is the potential for another process on a different client * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... */ static int nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct sillyrename *sp; struct nfsnode *np; int error; short pid; unsigned int lticks; cache_purge(dvp); np = VTONFS(vp); #ifndef DIAGNOSTIC if (vp->v_type == VDIR) panic("nfs: sillyrename dir"); #endif MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename), M_NFSREQ, M_WAITOK); sp->s_cred = crhold(cnp->cn_cred); sp->s_dvp = dvp; sp->s_removeit = nfs_removeit; VREF(dvp); /* * Fudge together a funny name. * Changing the format of the funny name to accomodate more * sillynames per directory. * The name is now changed to .nfs...4, where ticks is * CPU ticks since boot. */ pid = cnp->cn_thread->td_proc->p_pid; lticks = (unsigned int)ticks; for ( ; ; ) { sp->s_namlen = sprintf(sp->s_name, ".nfs.%08x.%04x4.4", lticks, pid); if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_thread, NULL)) break; lticks++; } error = nfs_renameit(dvp, cnp, sp); if (error) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_thread, &np); np->n_sillyrename = sp; return (0); bad: vrele(sp->s_dvp); crfree(sp->s_cred); free((caddr_t)sp, M_NFSREQ); return (error); } /* * Look up a file name and optionally either update the file handle or * allocate an nfsnode, depending on the value of npp. * npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred, struct thread *td, struct nfsnode **npp) { struct vnode *newvp = NULL; struct nfsnode *np, *dnp = VTONFS(dvp); caddr_t bpos, dpos; int error = 0, fhlen, attrflag; struct mbuf *mreq, *mrep, *md, *mb; nfsfh_t *nfhp; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_LOOKUP]++; mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(dvp, v3); nfsm_strtom(name, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, td, cred); if (npp && !error) { nfsm_getfh(nfhp, fhlen, v3); if (*npp) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { free((caddr_t)np->n_fhp, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK); bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen); np->n_fhsize = fhlen; newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp, fhlen)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); if (!attrflag && *npp == NULL) { m_freem(mrep); if (newvp == dvp) vrele(newvp); else vput(newvp); return (ENOENT); } } else nfsm_loadattr(newvp, NULL); } m_freem(mrep); nfsmout: if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } return (error); } /* * Nfs Version 3 commit rpc */ int nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, struct thread *td) { u_int32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb; mtx_lock(&nmp->nm_mtx); if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { mtx_unlock(&nmp->nm_mtx); return (0); } mtx_unlock(&nmp->nm_mtx); nfsstats.rpccnt[NFSPROC_COMMIT]++; mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, 1); tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nfsm_request(vp, NFSPROC_COMMIT, td, cred); nfsm_wcc_data(vp, wccflag); if (!error) { tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF); if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl, NFSX_V3WRITEVERF)) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); error = NFSERR_STALEWRITEVERF; } } m_freem(mrep); nfsmout: return (error); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling nfs_asyncio(), otherwise just all nfs_doio() to do the * request. */ static int nfs_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct ucred *cr; KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp)); if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread)) (void)nfs_doio(ap->a_vp, bp, cr, curthread); return (0); } /* * fsync vnode op. Just call nfs_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(struct vop_fsync_args *ap) { return (nfs_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. */ static int nfs_flush(struct vnode *vp, int waitfor, struct thread *td, int commit) { struct nfsnode *np = VTONFS(vp); struct buf *bp; int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; int bvecsize = 0, bveccount; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but has not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, nfs_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV3(vp) && commit) { s = splbio(); if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; VI_LOCK(vp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. */ if (bveccount > NFS_COMMITBVECSIZ) { /* * Release the vnode interlock to avoid a lock * order reversal. */ VI_UNLOCK(vp); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); VI_LOCK(vp); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (bvecpos >= bvecsize) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { nbp = TAILQ_NEXT(bp, b_bobufs); continue; } if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT)) { BUF_UNLOCK(bp); nbp = TAILQ_NEXT(bp, b_bobufs); continue; } VI_UNLOCK(vp); bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. * * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; vfs_busy_pages(bp, 1); VI_LOCK(vp); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_bobufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } splx(s); VI_UNLOCK(vp); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. */ if (wcred != NOCRED) retv = nfs_commit(vp, off, (int)(endoff - off), wcred, td); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = nfs_commit(vp, off, (int)size, bp->b_wcred, td); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ s = splbio(); bufobj_wref(&vp->v_bufobj); bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_dirtyoff = bp->b_dirtyend = 0; splx(s); bufdone(bp); } } } /* * Start/do any write(s) that are required. */ loop: s = splbio(); VI_LOCK(vp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp), "nfsfsync", slpflag, slptimeo); splx(s); if (error == 0) { BUF_UNLOCK(bp); goto loop; } if (error == ENOLCK) goto loop; if (nfs_sigintr(nmp, NULL, td)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } VI_UNLOCK(vp); bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= B_ASYNC; splx(s); bwrite(bp); if (nfs_sigintr(nmp, NULL, td)) { error = EINTR; goto done; } goto loop; } splx(s); if (passone) { passone = 0; VI_UNLOCK(vp); goto again; } if (waitfor == MNT_WAIT) { while (vp->v_bufobj.bo_numoutput) { error = bufobj_wwait(&vp->v_bufobj, slpflag, slptimeo); if (error) { VI_UNLOCK(vp); error = nfs_sigintr(nmp, NULL, td); if (error) goto done; if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } VI_LOCK(vp); } } if (vp->v_bufobj.bo_dirty.bv_cnt != 0 && commit) { VI_UNLOCK(vp); goto loop; } /* * Wait for all the async IO requests to drain */ VI_UNLOCK(vp); mtx_lock(&np->n_mtx); while (np->n_directio_asyncwr > 0) { np->n_flag |= NFSYNCWAIT; error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr, &np->n_mtx, slpflag | (PRIBIO + 1), "nfsfsync", 0); if (error) { if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) { mtx_unlock(&np->n_mtx); error = EINTR; goto done; } } } mtx_unlock(&np->n_mtx); } else VI_UNLOCK(vp); mtx_lock(&np->n_mtx); if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0 && vp->v_bufobj.bo_numoutput == 0 && np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; mtx_unlock(&np->n_mtx); done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); return (error); } /* * NFS advisory byte-level locks. */ static int nfs_advlock(struct vop_advlock_args *ap) { int error; mtx_lock(&Giant); if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { struct nfsnode *np = VTONFS(ap->a_vp); error = lf_advlock(ap, &(np->n_lockf), np->n_size); goto out; } error = nfs_dolock(ap); out: mtx_unlock(&Giant); return (error); } /* * Print out the contents of an nfsnode. */ static int nfs_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); nfs_printf("\tfileid %ld fsid 0x%x", np->n_vattr.va_fileid, np->n_vattr.va_fsid); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * This is the "real" nfs::bwrite(struct buf*). * We set B_CACHE if this is a VMIO buffer. */ int nfs_writebp(struct buf *bp, int force __unused, struct thread *td) { int s; int oldflags = bp->b_flags; #if 0 int retv = 1; off_t off; #endif if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not locked???"); if (bp->b_flags & B_INVAL) { brelse(bp); return(0); } bp->b_flags |= B_CACHE; /* * Undirty the bp. We will redirty it later if the I/O fails. */ s = splbio(); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); curthread->td_ru.ru_oublock++; splx(s); /* * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ vfs_busy_pages(bp, 1); BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if( (oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); if (oldflags & B_DELWRI) { s = splbio(); reassignbuf(bp); splx(s); } brelse(bp); return (rtval); } return (0); } /* * nfs special file access vnode op. * Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(struct vop_access_args *ap) { struct vattr *vap; struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; struct vattr vattr; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } vap = &vattr; error = VOP_GETATTR(vp, vap, cred, ap->a_td); if (error) goto out; error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid, mode, cred, NULL); out: return error; } /* * Read wrapper for fifos. */ static int nfsfifo_read(struct vop_read_args *ap) { struct nfsnode *np = VTONFS(ap->a_vp); int error; /* * Set access flag. */ mtx_lock(&np->n_mtx); np->n_flag |= NACC; getnanotime(&np->n_atim); mtx_unlock(&np->n_mtx); error = fifo_specops.vop_read(ap); return error; } /* * Write wrapper for fifos. */ static int nfsfifo_write(struct vop_write_args *ap) { struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ mtx_lock(&np->n_mtx); np->n_flag |= NUPD; getnanotime(&np->n_mtim); mtx_unlock(&np->n_mtx); return(fifo_specops.vop_write(ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. */ static int nfsfifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr vattr; struct timespec ts; mtx_lock(&np->n_mtx); if (np->n_flag & (NACC | NUPD)) { getnanotime(&ts); if (np->n_flag & NACC) np->n_atim = ts; if (np->n_flag & NUPD) np->n_mtim = ts; np->n_flag |= NCHG; if (vrefcnt(vp) == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; mtx_unlock(&np->n_mtx); (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_td); goto out; } } mtx_unlock(&np->n_mtx); out: return (fifo_specops.vop_close(ap)); } /* * Just call nfs_writebp() with the force argument set to 1. * * NOTE: B_DONE may or may not be set in a_bp on call. */ static int nfs_bwrite(struct buf *bp) { return (nfs_writebp(bp, 1, curthread)); } struct buf_ops buf_ops_nfs = { .bop_name = "buf_ops_nfs", .bop_write = nfs_bwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; Index: head/sys/nfsserver/nfs_serv.c =================================================================== --- head/sys/nfsserver/nfs_serv.c (revision 175201) +++ head/sys/nfsserver/nfs_serv.c (revision 175202) @@ -1,4284 +1,4284 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_serv.c 8.8 (Berkeley) 7/31/95 */ #include __FBSDID("$FreeBSD$"); /* * nfs version 2 and 3 server calls to vnode ops * - these routines generally have 3 phases * 1 - break down and validate rpc request in mbuf list * 2 - do the vnode ops for the request * (surprisingly ?? many are very similar to syscalls in vfs_syscalls.c) * 3 - build the rpc reply in an mbuf list * nb: * - do not mix the phases, since the nfsm_?? macros can return failures * on a bad rpc or similar and do not do any vrele() or vput()'s * * - the nfsm_reply() macro generates an nfs rpc reply with the nfs * error number iff error != 0 whereas * returning an error from the server function implies a fatal error * such as a badly constructed rpc request that should be dropped without * a reply. * For nfsm_reply(), the case where error == EBADRPC is treated * specially; after constructing a reply, it does an immediate * `goto nfsmout' to avoid getting any V3 post-op status appended. * * Other notes: * Warning: always pay careful attention to resource cleanup on return * and note that nfsm_*() macros can terminate a procedure on certain * errors. * * lookup() and namei() * may return garbage in various structural fields/return elements * if an error is returned, and may garbage up nd.ni_dvp even if no * error is returned and you did not request LOCKPARENT or WANTPARENT. * * We use the ni_cnd.cn_flags 'HASBUF' flag to track whether the name * buffer has been freed or not. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NFSRV_DEBUG #define nfsdbprintf(info) printf info #else #define nfsdbprintf(info) #endif #define MAX_COMMIT_COUNT (1024 * 1024) #define NUM_HEURISTIC 1017 #define NHUSE_INIT 64 #define NHUSE_INC 16 #define NHUSE_MAX 2048 static struct nfsheur { struct vnode *nh_vp; /* vp to match (unreferenced pointer) */ off_t nh_nextr; /* next offset for sequential detection */ int nh_use; /* use count for selection */ int nh_seqcount; /* heuristic */ } nfsheur[NUM_HEURISTIC]; /* Global vars */ int nfsrvw_procrastinate = NFS_GATHERDELAY * 1000; int nfsrvw_procrastinate_v3 = 0; static struct timeval nfsver = { 0 }; SYSCTL_NODE(_vfs, OID_AUTO, nfsrv, CTLFLAG_RW, 0, "NFS server"); static int nfs_async; static int nfs_commit_blks; static int nfs_commit_miss; SYSCTL_INT(_vfs_nfsrv, OID_AUTO, async, CTLFLAG_RW, &nfs_async, 0, ""); SYSCTL_INT(_vfs_nfsrv, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks, 0, ""); SYSCTL_INT(_vfs_nfsrv, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss, 0, ""); struct nfsrvstats nfsrvstats; SYSCTL_STRUCT(_vfs_nfsrv, NFS_NFSRVSTATS, nfsrvstats, CTLFLAG_RW, &nfsrvstats, nfsrvstats, "S,nfsrvstats"); static int nfsrv_access(struct vnode *, int, struct ucred *, int, struct thread *, int); static void nfsrvw_coalesce(struct nfsrv_descript *, struct nfsrv_descript *); /* * Clear nameidata fields that are tested in nsfmout cleanup code prior * to using first nfsm macro (that might jump to the cleanup code). */ static __inline void ndclear(struct nameidata *nd) { nd->ni_cnd.cn_flags = 0; nd->ni_vp = NULL; nd->ni_dvp = NULL; nd->ni_startdir = NULL; } /* * Takes two vfslocked integers and returns with at most one * reference to giant. The return value indicates whether giant * is held by either lock. This simplifies nfsrv ops by allowing * them to track only one vfslocked var. */ static __inline int nfsrv_lockedpair(int vfs1, int vfs2) { if (vfs1 && vfs2) VFS_UNLOCK_GIANT(vfs2); return (vfs1 | vfs2); } static __inline int nfsrv_lockedpair_nd(int vfs1, struct nameidata *nd) { int vfs2; vfs2 = NDHASGIANT(nd); return nfsrv_lockedpair(vfs1, vfs2); } /* * nfs v3 access service */ int nfsrv3_access(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct vnode *vp = NULL; nfsfh_t nfh; fhandle_t *fhp; u_int32_t *tl; caddr_t bpos; int error = 0, rdonly, getret; struct mbuf *mb, *mreq; struct vattr vattr, *vap = &vattr; u_long testmode, nfsmode; int v3 = (nfsd->nd_flag & ND_NFSV3); int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (!v3) panic("nfsrv3_access: v3 proc called on a v2 connection"); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED); error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(1, NULL); error = 0; goto nfsmout; } nfsmode = fxdr_unsigned(u_int32_t, *tl); if ((nfsmode & NFSV3ACCESS_READ) && nfsrv_access(vp, VREAD, cred, rdonly, td, 0)) nfsmode &= ~NFSV3ACCESS_READ; if (vp->v_type == VDIR) testmode = (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE); else testmode = (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); if ((nfsmode & testmode) && nfsrv_access(vp, VWRITE, cred, rdonly, td, 0)) nfsmode &= ~testmode; if (vp->v_type == VDIR) testmode = NFSV3ACCESS_LOOKUP; else testmode = NFSV3ACCESS_EXECUTE; if ((nfsmode & testmode) && nfsrv_access(vp, VEXEC, cred, rdonly, td, 0)) nfsmode &= ~testmode; getret = VOP_GETATTR(vp, vap, cred, td); vput(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(1) + NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, vap); tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(nfsmode); nfsmout: if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs getattr service */ int nfsrv_getattr(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nfs_fattr *fp; struct vattr va; struct vattr *vap = &va; struct vnode *vp = NULL; nfsfh_t nfh; fhandle_t *fhp; caddr_t bpos; int error = 0, rdonly; struct mbuf *mb, *mreq; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { nfsm_reply(0); error = 0; goto nfsmout; } error = VOP_GETATTR(vp, vap, cred, td); vput(vp); vp = NULL; nfsm_reply(NFSX_FATTR(nfsd->nd_flag & ND_NFSV3)); if (error) { error = 0; goto nfsmout; } fp = nfsm_build(struct nfs_fattr *, NFSX_FATTR(nfsd->nd_flag & ND_NFSV3)); nfsm_srvfillattr(vap, fp); /* fall through */ nfsmout: if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs setattr service */ int nfsrv_setattr(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct vattr va, preat; struct vattr *vap = &va; struct nfsv2_sattr *sp; struct nfs_fattr *fp; struct vnode *vp = NULL; nfsfh_t nfh; fhandle_t *fhp; u_int32_t *tl; caddr_t bpos; int error = 0, rdonly, preat_ret = 1, postat_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3), gcheck = 0; struct mbuf *mb, *mreq; struct timespec guard = { 0, 0 }; struct mount *mp = NULL; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto out; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ VATTR_NULL(vap); if (v3) { nfsm_srvsattr(vap); tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED); gcheck = fxdr_unsigned(int, *tl); if (gcheck) { tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED); fxdr_nfsv3time(tl, &guard); } } else { sp = nfsm_dissect_nonblock(struct nfsv2_sattr *, NFSX_V2SATTR); /* * Nah nah nah nah na nah * There is a bug in the Sun client that puts 0xffff in the mode * field of sattr when it should put in 0xffffffff. The u_short * doesn't sign extend. * --> check the low order 2 bytes for 0xffff */ if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff) vap->va_mode = nfstov_mode(sp->sa_mode); if (sp->sa_uid != nfsrv_nfs_xdrneg1) vap->va_uid = fxdr_unsigned(uid_t, sp->sa_uid); if (sp->sa_gid != nfsrv_nfs_xdrneg1) vap->va_gid = fxdr_unsigned(gid_t, sp->sa_gid); if (sp->sa_size != nfsrv_nfs_xdrneg1) vap->va_size = fxdr_unsigned(u_quad_t, sp->sa_size); if (sp->sa_atime.nfsv2_sec != nfsrv_nfs_xdrneg1) { #ifdef notyet fxdr_nfsv2time(&sp->sa_atime, &vap->va_atime); #else vap->va_atime.tv_sec = fxdr_unsigned(int32_t, sp->sa_atime.nfsv2_sec); vap->va_atime.tv_nsec = 0; #endif } if (sp->sa_mtime.nfsv2_sec != nfsrv_nfs_xdrneg1) fxdr_nfsv2time(&sp->sa_mtime, &vap->va_mtime); } /* * Now that we have all the fields, lets do it. */ error = nfsrv_fhtovp(fhp, 1, &vp, &tvfslocked, cred, slp, nam, &rdonly, TRUE); vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked); if (error) { nfsm_reply(2 * NFSX_UNSIGNED); if (v3) nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap); error = 0; goto nfsmout; } /* * vp now an active resource, pay careful attention to cleanup */ if (v3) { error = preat_ret = VOP_GETATTR(vp, &preat, cred, td); if (!error && gcheck && (preat.va_ctime.tv_sec != guard.tv_sec || preat.va_ctime.tv_nsec != guard.tv_nsec)) error = NFSERR_NOT_SYNC; if (error) { vput(vp); vp = NULL; nfsm_reply(NFSX_WCCDATA(v3)); if (v3) nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap); error = 0; goto nfsmout; } } /* * If the size is being changed write acces is required, otherwise * just check for a read only filesystem. */ if (vap->va_size == ((u_quad_t)((quad_t) -1))) { if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { error = EROFS; goto out; } } else { if (vp->v_type == VDIR) { error = EISDIR; goto out; } else if ((error = nfsrv_access(vp, VWRITE, cred, rdonly, td, 0)) != 0) goto out; } error = VOP_SETATTR(vp, vap, cred, td); postat_ret = VOP_GETATTR(vp, vap, cred, td); if (!error) error = postat_ret; out: if (vp != NULL) vput(vp); vp = NULL; nfsm_reply(NFSX_WCCORFATTR(v3)); if (v3) { nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap); } else if (!error) { /* v2 non-error case. */ fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } error = 0; /* fall through */ nfsmout: if (vp) vput(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs lookup rpc */ int nfsrv_lookup(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nfs_fattr *fp; struct nameidata nd, ind, *ndp = &nd; struct vnode *vp, *dirp = NULL; nfsfh_t nfh; fhandle_t *fhp; caddr_t bpos; int error = 0, len, dirattr_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3), pubflag; struct mbuf *mb, *mreq; struct vattr va, dirattr, *vap = &va; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); nfsm_srvnamesiz(len); pubflag = nfs_ispublicfh(fhp); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = LOOKUP; nd.ni_cnd.cn_flags = LOCKLEAF | SAVESTART | MPSAFE; error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirattr, &dirattr_ret, td, pubflag); vfslocked = NDHASGIANT(&nd); /* * namei failure, only dirp to cleanup. Clear out garbarge from * structure in case macros jump to nfsmout. */ if (error) { if (dirp) { vrele(dirp); dirp = NULL; } nfsm_reply(NFSX_POSTOPATTR(v3)); if (v3) nfsm_srvpostop_attr(dirattr_ret, &dirattr); error = 0; goto nfsmout; } /* * Locate index file for public filehandle * * error is 0 on entry and 0 on exit from this block. */ if (pubflag) { if (nd.ni_vp->v_type == VDIR && nfs_pub.np_index != NULL) { /* * Setup call to lookup() to see if we can find * the index file. Arguably, this doesn't belong * in a kernel.. Ugh. If an error occurs, do not * try to install an index file and then clear the * error. * * When we replace nd with ind and redirect ndp, * maintenance of ni_startdir and ni_vp shift to * ind and we have to clean them up in the old nd. * However, the cnd resource continues to be maintained * via the original nd. Confused? You aren't alone! */ ind = nd; VOP_UNLOCK(nd.ni_vp, 0, td); ind.ni_pathlen = strlen(nfs_pub.np_index); ind.ni_cnd.cn_nameptr = ind.ni_cnd.cn_pnbuf = nfs_pub.np_index; ind.ni_startdir = nd.ni_vp; VREF(ind.ni_startdir); ind.ni_cnd.cn_flags &= ~GIANTHELD; tvfslocked = VFS_LOCK_GIANT(ind.ni_startdir->v_mount); if (tvfslocked) nd.ni_cnd.cn_flags |= GIANTHELD; error = lookup(&ind); ind.ni_dvp = NULL; vfslocked = nfsrv_lockedpair_nd(vfslocked, &ind); ind.ni_cnd.cn_flags &= ~GIANTHELD; if (error == 0) { /* * Found an index file. Get rid of * the old references. transfer nd.ni_vp' */ if (dirp) vrele(dirp); dirp = nd.ni_vp; nd.ni_vp = NULL; vrele(nd.ni_startdir); nd.ni_startdir = NULL; ndp = &ind; } error = 0; } /* * If the public filehandle was used, check that this lookup * didn't result in a filehandle outside the publicly exported * filesystem. We clear the poor vp here to avoid lockups due * to NFS I/O. */ if (ndp->ni_vp->v_mount != nfs_pub.np_mount) { vput(nd.ni_vp); nd.ni_vp = NULL; error = EPERM; } } /* * Resources at this point: * ndp->ni_vp may not be NULL */ if (error) { nfsm_reply(NFSX_POSTOPATTR(v3)); if (v3) nfsm_srvpostop_attr(dirattr_ret, &dirattr); error = 0; goto nfsmout; } /* * Get underlying attribute, then release remaining resources ( for * the same potential blocking reason ) and reply. */ vp = ndp->ni_vp; bzero((caddr_t)fhp, sizeof(nfh)); fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fhp->fh_fid); if (!error) error = VOP_GETATTR(vp, vap, cred, td); vput(vp); vrele(ndp->ni_startdir); vrele(dirp); ndp->ni_vp = NULL; ndp->ni_startdir = NULL; dirp = NULL; nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPORFATTR(v3) + NFSX_POSTOPATTR(v3)); if (error) { if (v3) nfsm_srvpostop_attr(dirattr_ret, &dirattr); error = 0; goto nfsmout; } nfsm_srvfhtom(fhp, v3); if (v3) { nfsm_srvpostop_attr(0, vap); nfsm_srvpostop_attr(dirattr_ret, &dirattr); } else { fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } nfsmout: if (ndp->ni_vp || dirp || ndp->ni_startdir) { if (ndp->ni_vp) vput(ndp->ni_vp); if (dirp) vrele(dirp); if (ndp->ni_startdir) vrele(ndp->ni_startdir); } NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * nfs readlink service */ int nfsrv_readlink(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN]; struct iovec *ivp = iv; struct mbuf *mp; u_int32_t *tl; caddr_t bpos; int error = 0, rdonly, i, tlen, len, getret; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mp3, *nmp, *mreq; struct vnode *vp = NULL; struct vattr attr; nfsfh_t nfh; fhandle_t *fhp; struct uio io, *uiop = &io; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; #ifndef nolint mp = NULL; #endif mp3 = NULL; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); len = 0; i = 0; while (len < NFS_MAXPATHLEN) { MGET(nmp, M_TRYWAIT, MT_DATA); MCLGET(nmp, M_TRYWAIT); nmp->m_len = NFSMSIZ(nmp); if (len == 0) mp3 = mp = nmp; else { mp->m_next = nmp; mp = nmp; } if ((len + mp->m_len) > NFS_MAXPATHLEN) { mp->m_len = NFS_MAXPATHLEN - len; len = NFS_MAXPATHLEN; } else len += mp->m_len; ivp->iov_base = mtod(mp, caddr_t); ivp->iov_len = mp->m_len; i++; ivp++; } uiop->uio_iov = iv; uiop->uio_iovcnt = i; uiop->uio_offset = 0; uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { nfsm_reply(2 * NFSX_UNSIGNED); if (v3) nfsm_srvpostop_attr(1, NULL); error = 0; goto nfsmout; } if (vp->v_type != VLNK) { if (v3) error = EINVAL; else error = ENXIO; } else error = VOP_READLINK(vp, uiop, cred); getret = VOP_GETATTR(vp, &attr, cred, td); vput(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_UNSIGNED); if (v3) nfsm_srvpostop_attr(getret, &attr); if (error) { error = 0; goto nfsmout; } if (uiop->uio_resid > 0) { len -= uiop->uio_resid; tlen = nfsm_rndup(len); nfsm_adj(mp3, NFS_MAXPATHLEN-tlen, tlen-len); } tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(len); mb->m_next = mp3; mp3 = NULL; nfsmout: if (mp3) m_freem(mp3); if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs read service */ int nfsrv_read(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct iovec *iv; struct iovec *iv2; struct mbuf *m; struct nfs_fattr *fp; u_int32_t *tl; int i; caddr_t bpos; int error = 0, rdonly, cnt, len, left, siz, tlen, getret; int v3 = (nfsd->nd_flag & ND_NFSV3), reqlen; struct mbuf *mb, *mreq; struct mbuf *m2; struct vnode *vp = NULL; nfsfh_t nfh; fhandle_t *fhp; struct uio io, *uiop = &io; struct vattr va, *vap = &va; struct nfsheur *nh; off_t off; int ioflag = 0; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if (v3) { tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED); off = fxdr_hyper(tl); } else { tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED); off = (off_t)fxdr_unsigned(u_int32_t, *tl); } nfsm_srvstrsiz(reqlen, NFS_SRVMAXDATA(nfsd)); /* * Reference vp. If an error occurs, vp will be invalid, but we * have to NULL it just in case. The macros might goto nfsmout * as well. */ error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { vp = NULL; nfsm_reply(2 * NFSX_UNSIGNED); if (v3) nfsm_srvpostop_attr(1, NULL); error = 0; goto nfsmout; } if (vp->v_type != VREG) { if (v3) error = EINVAL; else error = (vp->v_type == VDIR) ? EISDIR : EACCES; } if (!error) { if ((error = nfsrv_access(vp, VREAD, cred, rdonly, td, 1)) != 0) error = nfsrv_access(vp, VEXEC, cred, rdonly, td, 1); } getret = VOP_GETATTR(vp, vap, cred, td); if (!error) error = getret; if (error) { vput(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(v3)); if (v3) nfsm_srvpostop_attr(getret, vap); error = 0; goto nfsmout; } /* * Calculate byte count to read */ if (off >= vap->va_size) cnt = 0; else if ((off + reqlen) > vap->va_size) cnt = vap->va_size - off; else cnt = reqlen; /* * Calculate seqcount for heuristic */ { int hi; int try = 32; /* * Locate best candidate */ hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC; nh = &nfsheur[hi]; while (try--) { if (nfsheur[hi].nh_vp == vp) { nh = &nfsheur[hi]; break; } if (nfsheur[hi].nh_use > 0) --nfsheur[hi].nh_use; hi = (hi + 1) % NUM_HEURISTIC; if (nfsheur[hi].nh_use < nh->nh_use) nh = &nfsheur[hi]; } if (nh->nh_vp != vp) { nh->nh_vp = vp; nh->nh_nextr = off; nh->nh_use = NHUSE_INIT; if (off == 0) nh->nh_seqcount = 4; else nh->nh_seqcount = 1; } /* * Calculate heuristic */ if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) { if (++nh->nh_seqcount > IO_SEQMAX) nh->nh_seqcount = IO_SEQMAX; } else if (nh->nh_seqcount > 1) { nh->nh_seqcount = 1; } else { nh->nh_seqcount = 0; } nh->nh_use += NHUSE_INC; if (nh->nh_use > NHUSE_MAX) nh->nh_use = NHUSE_MAX; ioflag |= nh->nh_seqcount << IO_SEQSHIFT; } nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt)); if (v3) { tl = nfsm_build(u_int32_t *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED); *tl++ = nfsrv_nfs_true; fp = (struct nfs_fattr *)tl; tl += (NFSX_V3FATTR / sizeof (u_int32_t)); } else { tl = nfsm_build(u_int32_t *, NFSX_V2FATTR + NFSX_UNSIGNED); fp = (struct nfs_fattr *)tl; tl += (NFSX_V2FATTR / sizeof (u_int32_t)); } len = left = nfsm_rndup(cnt); if (cnt > 0) { /* * Generate the mbuf list with the uio_iov ref. to it. */ i = 0; m = m2 = mb; while (left > 0) { siz = min(M_TRAILINGSPACE(m), left); if (siz > 0) { left -= siz; i++; } if (left > 0) { MGET(m, M_TRYWAIT, MT_DATA); MCLGET(m, M_TRYWAIT); m->m_len = 0; m2->m_next = m; m2 = m; } } MALLOC(iv, struct iovec *, i * sizeof (struct iovec), M_TEMP, M_WAITOK); uiop->uio_iov = iv2 = iv; m = mb; left = len; i = 0; while (left > 0) { if (m == NULL) panic("nfsrv_read iov"); siz = min(M_TRAILINGSPACE(m), left); if (siz > 0) { iv->iov_base = mtod(m, caddr_t) + m->m_len; iv->iov_len = siz; m->m_len += siz; left -= siz; iv++; i++; } m = m->m_next; } uiop->uio_iovcnt = i; uiop->uio_offset = off; uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred); off = uiop->uio_offset; nh->nh_nextr = off; FREE((caddr_t)iv2, M_TEMP); if (error || (getret = VOP_GETATTR(vp, vap, cred, td))) { if (!error) error = getret; m_freem(mreq); vput(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(v3)); if (v3) nfsm_srvpostop_attr(getret, vap); error = 0; goto nfsmout; } } else uiop->uio_resid = 0; vput(vp); vp = NULL; nfsm_srvfillattr(vap, fp); tlen = len - uiop->uio_resid; cnt = cnt < tlen ? cnt : tlen; tlen = nfsm_rndup(cnt); if (len != tlen || tlen != cnt) nfsm_adj(mb, len - tlen, tlen - cnt); if (v3) { *tl++ = txdr_unsigned(cnt); if (cnt < reqlen) *tl++ = nfsrv_nfs_true; else *tl++ = nfsrv_nfs_false; } *tl = txdr_unsigned(cnt); nfsmout: if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs write service */ int nfsrv_write(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct iovec *ivp; int i, cnt; struct mbuf *mp; struct nfs_fattr *fp; struct iovec *iv; struct vattr va, forat; struct vattr *vap = &va; u_int32_t *tl; caddr_t bpos; int error = 0, rdonly, len, forat_ret = 1; int ioflags, aftat_ret = 1, retlen = 0, zeroing, adjust; int stable = NFSV3WRITE_FILESYNC; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *vp = NULL; nfsfh_t nfh; fhandle_t *fhp; struct uio io, *uiop = &io; off_t off; struct mount *mntp = NULL; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; if (mrep == NULL) { *mrq = NULL; error = 0; goto nfsmout; } fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mntp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto ereply; } vfslocked = VFS_LOCK_GIANT(mntp); (void) vn_start_write(NULL, &mntp, V_WAIT); vfs_rel(mntp); /* The write holds a ref. */ if (v3) { tl = nfsm_dissect_nonblock(u_int32_t *, 5 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 3; stable = fxdr_unsigned(int, *tl++); } else { tl = nfsm_dissect_nonblock(u_int32_t *, 4 * NFSX_UNSIGNED); off = (off_t)fxdr_unsigned(u_int32_t, *++tl); tl += 2; if (nfs_async) stable = NFSV3WRITE_UNSTABLE; } retlen = len = fxdr_unsigned(int32_t, *tl); cnt = i = 0; /* * For NFS Version 2, it is not obvious what a write of zero length * should do, but I might as well be consistent with Version 3, * which is to return ok so long as there are no permission problems. */ if (len > 0) { zeroing = 1; mp = mrep; while (mp) { if (mp == md) { zeroing = 0; adjust = dpos - mtod(mp, caddr_t); mp->m_len -= adjust; if (mp->m_len > 0 && adjust > 0) mp->m_data += adjust; } if (zeroing) mp->m_len = 0; else if (mp->m_len > 0) { i += mp->m_len; if (i > len) { mp->m_len -= (i - len); zeroing = 1; } if (mp->m_len > 0) cnt++; } mp = mp->m_next; } } if (len > NFS_MAXDATA || len < 0 || i < len) { error = EIO; nfsm_reply(2 * NFSX_UNSIGNED); if (v3) nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); error = 0; goto nfsmout; } error = nfsrv_fhtovp(fhp, 1, &vp, &tvfslocked, cred, slp, nam, &rdonly, TRUE); vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked); if (error) { vp = NULL; nfsm_reply(2 * NFSX_UNSIGNED); if (v3) nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); error = 0; goto nfsmout; } if (v3) forat_ret = VOP_GETATTR(vp, &forat, cred, td); if (vp->v_type != VREG) { if (v3) error = EINVAL; else error = (vp->v_type == VDIR) ? EISDIR : EACCES; } if (!error) error = nfsrv_access(vp, VWRITE, cred, rdonly, td, 1); if (error) { vput(vp); vp = NULL; nfsm_reply(NFSX_WCCDATA(v3)); if (v3) nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); error = 0; goto nfsmout; } if (len > 0) { MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP, M_WAITOK); uiop->uio_iov = iv = ivp; uiop->uio_iovcnt = cnt; mp = mrep; while (mp) { if (mp->m_len > 0) { ivp->iov_base = mtod(mp, caddr_t); ivp->iov_len = mp->m_len; ivp++; } mp = mp->m_next; } /* * XXX * The IO_METASYNC flag indicates that all metadata (and not just * enough to ensure data integrity) mus be written to stable storage * synchronously. * (IO_METASYNC is not yet implemented in 4.4BSD-Lite.) */ if (stable == NFSV3WRITE_UNSTABLE) ioflags = IO_NODELOCKED; else if (stable == NFSV3WRITE_DATASYNC) ioflags = (IO_SYNC | IO_NODELOCKED); else ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED); uiop->uio_resid = len; uiop->uio_rw = UIO_WRITE; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; uiop->uio_offset = off; error = VOP_WRITE(vp, uiop, ioflags, cred); /* XXXRW: unlocked write. */ nfsrvstats.srvvop_writes++; FREE((caddr_t)iv, M_TEMP); } aftat_ret = VOP_GETATTR(vp, vap, cred, td); vput(vp); vp = NULL; if (!error) error = aftat_ret; ereply: nfsm_reply(NFSX_PREOPATTR(v3) + NFSX_POSTOPORFATTR(v3) + 2 * NFSX_UNSIGNED + NFSX_WRITEVERF(v3)); if (v3) { nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); if (error) { error = 0; goto nfsmout; } tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(retlen); /* * If nfs_async is set, then pretend the write was FILESYNC. */ if (stable == NFSV3WRITE_UNSTABLE && !nfs_async) *tl++ = txdr_unsigned(stable); else *tl++ = txdr_unsigned(NFSV3WRITE_FILESYNC); /* * Actually, there is no need to txdr these fields, * but it may make the values more human readable, * for debugging purposes. */ if (nfsver.tv_sec == 0) nfsver = boottime; *tl++ = txdr_unsigned(nfsver.tv_sec); *tl = txdr_unsigned(nfsver.tv_usec); } else if (!error) { /* v2 non-error case. */ fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } error = 0; nfsmout: if (vp) vput(vp); vn_finished_write(mntp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * For the purposes of write gathering, we must decide if the credential * associated with two pending requests have equivilent privileges. Since * NFS only uses a subset of the BSD ucred -- the effective uid and group * IDs -- we have a compare routine that checks only the relevant fields. */ static int nfsrv_samecred(struct ucred *cr1, struct ucred *cr2) { int i; if (cr1->cr_uid != cr2->cr_uid) return (0); if (cr1->cr_ngroups != cr2->cr_ngroups) return (0); for (i = 0; i < cr1->cr_ngroups; i++) { if (cr1->cr_groups[i] != cr2->cr_groups[i]) return (0); } return (1); } /* * NFS write service with write gathering support. Called when * nfsrvw_procrastinate > 0. * See: Chet Juszczak, "Improving the Write Performance of an NFS Server", * in Proc. of the Winter 1994 Usenix Conference, pg. 247-259, San Franscisco, * Jan. 1994. */ int nfsrv_writegather(struct nfsrv_descript **ndp, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct iovec *ivp; struct mbuf *mp; struct nfsrv_descript *wp, *nfsd, *owp, *swp; struct nfs_fattr *fp; int i; struct iovec *iov; struct nfsrvw_delayhash *wpp; struct ucred *cred; struct vattr va, forat; u_int32_t *tl; caddr_t bpos, dpos; int error = 0, rdonly, len, forat_ret = 1; int ioflags, aftat_ret = 1, s, adjust, v3, zeroing; struct mbuf *mb, *mreq, *mrep, *md; struct vnode *vp = NULL; struct uio io, *uiop = &io; u_quad_t cur_usec; struct mount *mntp = NULL; int mvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint i = 0; len = 0; #endif *mrq = NULL; if (*ndp) { nfsd = *ndp; *ndp = NULL; mrep = nfsd->nd_mrep; md = nfsd->nd_md; dpos = nfsd->nd_dpos; cred = nfsd->nd_cr; v3 = (nfsd->nd_flag & ND_NFSV3); LIST_INIT(&nfsd->nd_coalesce); nfsd->nd_mreq = NULL; nfsd->nd_stable = NFSV3WRITE_FILESYNC; cur_usec = nfs_curusec(); nfsd->nd_time = cur_usec + (v3 ? nfsrvw_procrastinate_v3 : nfsrvw_procrastinate); /* * Now, get the write header.. */ nfsm_srvmtofh(&nfsd->nd_fh); if (v3) { tl = nfsm_dissect_nonblock(u_int32_t *, 5 * NFSX_UNSIGNED); nfsd->nd_off = fxdr_hyper(tl); tl += 3; nfsd->nd_stable = fxdr_unsigned(int, *tl++); } else { tl = nfsm_dissect_nonblock(u_int32_t *, 4 * NFSX_UNSIGNED); nfsd->nd_off = (off_t)fxdr_unsigned(u_int32_t, *++tl); tl += 2; if (nfs_async) nfsd->nd_stable = NFSV3WRITE_UNSTABLE; } len = fxdr_unsigned(int32_t, *tl); nfsd->nd_len = len; nfsd->nd_eoff = nfsd->nd_off + len; /* * Trim the header out of the mbuf list and trim off any trailing * junk so that the mbuf list has only the write data. */ zeroing = 1; i = 0; mp = mrep; while (mp) { if (mp == md) { zeroing = 0; adjust = dpos - mtod(mp, caddr_t); mp->m_len -= adjust; if (mp->m_len > 0 && adjust > 0) mp->m_data += adjust; } if (zeroing) mp->m_len = 0; else { i += mp->m_len; if (i > len) { mp->m_len -= (i - len); zeroing = 1; } } mp = mp->m_next; } if (len > NFS_MAXDATA || len < 0 || i < len) { nfsmout: m_freem(mrep); error = EIO; nfsm_writereply(2 * NFSX_UNSIGNED); if (v3) nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va); nfsd->nd_mreq = mreq; nfsd->nd_mrep = NULL; nfsd->nd_time = 0; } /* * Add this entry to the hash and time queues. */ s = splsoftclock(); owp = NULL; wp = LIST_FIRST(&slp->ns_tq); while (wp && wp->nd_time < nfsd->nd_time) { owp = wp; wp = LIST_NEXT(wp, nd_tq); } NFS_DPF(WG, ("Q%03x", nfsd->nd_retxid & 0xfff)); if (owp) { LIST_INSERT_AFTER(owp, nfsd, nd_tq); } else { LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq); } if (nfsd->nd_mrep) { wpp = NWDELAYHASH(slp, nfsd->nd_fh.fh_fid.fid_data); owp = NULL; wp = LIST_FIRST(wpp); while (wp && bcmp((caddr_t)&nfsd->nd_fh,(caddr_t)&wp->nd_fh, NFSX_V3FH)){ owp = wp; wp = LIST_NEXT(wp, nd_hash); } while (wp && wp->nd_off < nfsd->nd_off && !bcmp((caddr_t)&nfsd->nd_fh,(caddr_t)&wp->nd_fh, NFSX_V3FH)) { owp = wp; wp = LIST_NEXT(wp, nd_hash); } if (owp) { LIST_INSERT_AFTER(owp, nfsd, nd_hash); /* * Search the hash list for overlapping entries and * coalesce. */ for(; nfsd && NFSW_CONTIG(owp, nfsd); nfsd = wp) { wp = LIST_NEXT(nfsd, nd_hash); if (nfsrv_samecred(owp->nd_cr, nfsd->nd_cr)) nfsrvw_coalesce(owp, nfsd); } } else { LIST_INSERT_HEAD(wpp, nfsd, nd_hash); } } splx(s); } /* * Now, do VOP_WRITE()s for any one(s) that need to be done now * and generate the associated reply mbuf list(s). */ loop1: cur_usec = nfs_curusec(); s = splsoftclock(); for (nfsd = LIST_FIRST(&slp->ns_tq); nfsd; nfsd = owp) { owp = LIST_NEXT(nfsd, nd_tq); if (nfsd->nd_time > cur_usec) break; if (nfsd->nd_mreq) continue; NFS_DPF(WG, ("P%03x", nfsd->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_tq); LIST_REMOVE(nfsd, nd_hash); splx(s); mrep = nfsd->nd_mrep; nfsd->nd_mrep = NULL; cred = nfsd->nd_cr; v3 = (nfsd->nd_flag & ND_NFSV3); forat_ret = aftat_ret = 1; error = nfsrv_fhtovp(&nfsd->nd_fh, 1, &vp, &vfslocked, cred, slp, nfsd->nd_nam, &rdonly, TRUE); if (!error) { if (v3) forat_ret = VOP_GETATTR(vp, &forat, cred, td); if (vp->v_type != VREG) { if (v3) error = EINVAL; else error = (vp->v_type == VDIR) ? EISDIR : EACCES; } } else { vp = NULL; } if (!error) error = nfsrv_access(vp, VWRITE, cred, rdonly, td, 1); if (nfsd->nd_stable == NFSV3WRITE_UNSTABLE) ioflags = IO_NODELOCKED; else if (nfsd->nd_stable == NFSV3WRITE_DATASYNC) ioflags = (IO_SYNC | IO_NODELOCKED); else ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED); uiop->uio_rw = UIO_WRITE; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; uiop->uio_offset = nfsd->nd_off; uiop->uio_resid = nfsd->nd_eoff - nfsd->nd_off; if (uiop->uio_resid > 0) { mp = mrep; i = 0; while (mp) { if (mp->m_len > 0) i++; mp = mp->m_next; } uiop->uio_iovcnt = i; MALLOC(iov, struct iovec *, i * sizeof (struct iovec), M_TEMP, M_WAITOK); uiop->uio_iov = ivp = iov; mp = mrep; while (mp) { if (mp->m_len > 0) { ivp->iov_base = mtod(mp, caddr_t); ivp->iov_len = mp->m_len; ivp++; } mp = mp->m_next; } mvfslocked = 0; if (!error) { if (vn_start_write(vp, &mntp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0, td); error = vn_start_write(NULL, &mntp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } mvfslocked = VFS_LOCK_GIANT(mntp); } if (!error) { error = VOP_WRITE(vp, uiop, ioflags, cred); /* XXXRW: unlocked write. */ nfsrvstats.srvvop_writes++; vn_finished_write(mntp); } VFS_UNLOCK_GIANT(mvfslocked); FREE((caddr_t)iov, M_TEMP); } m_freem(mrep); if (vp) { aftat_ret = VOP_GETATTR(vp, &va, cred, td); vput(vp); vp = NULL; } VFS_UNLOCK_GIANT(vfslocked); /* * Loop around generating replies for all write rpcs that have * now been completed. */ swp = nfsd; do { NFS_DPF(WG, ("R%03x", nfsd->nd_retxid & 0xfff)); if (error) { nfsm_writereply(NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va); } } else { nfsm_writereply(NFSX_PREOPATTR(v3) + NFSX_POSTOPORFATTR(v3) + 2 * NFSX_UNSIGNED + NFSX_WRITEVERF(v3)); if (v3) { nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va); tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(nfsd->nd_len); *tl++ = txdr_unsigned(swp->nd_stable); /* * Actually, there is no need to txdr these fields, * but it may make the values more human readable, * for debugging purposes. */ if (nfsver.tv_sec == 0) nfsver = boottime; *tl++ = txdr_unsigned(nfsver.tv_sec); *tl = txdr_unsigned(nfsver.tv_usec); } else { fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(&va, fp); } } nfsd->nd_mreq = mreq; if (nfsd->nd_mrep) panic("nfsrv_write: nd_mrep not free"); /* * Done. Put it at the head of the timer queue so that * the final phase can return the reply. */ s = splsoftclock(); if (nfsd != swp) { nfsd->nd_time = 0; LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq); } nfsd = LIST_FIRST(&swp->nd_coalesce); if (nfsd) { LIST_REMOVE(nfsd, nd_tq); } splx(s); } while (nfsd); s = splsoftclock(); swp->nd_time = 0; LIST_INSERT_HEAD(&slp->ns_tq, swp, nd_tq); splx(s); goto loop1; } splx(s); /* * Search for a reply to return. */ s = splsoftclock(); LIST_FOREACH(nfsd, &slp->ns_tq, nd_tq) if (nfsd->nd_mreq) { NFS_DPF(WG, ("X%03x", nfsd->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_tq); *mrq = nfsd->nd_mreq; *ndp = nfsd; break; } splx(s); return (0); } /* * Coalesce the write request nfsd into owp. To do this we must: * - remove nfsd from the queues * - merge nfsd->nd_mrep into owp->nd_mrep * - update the nd_eoff and nd_stable for owp * - put nfsd on owp's nd_coalesce list * NB: Must be called at splsoftclock(). */ static void nfsrvw_coalesce(struct nfsrv_descript *owp, struct nfsrv_descript *nfsd) { int overlap; struct mbuf *mp; struct nfsrv_descript *p; NFS_DPF(WG, ("C%03x-%03x", nfsd->nd_retxid & 0xfff, owp->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_hash); LIST_REMOVE(nfsd, nd_tq); if (owp->nd_eoff < nfsd->nd_eoff) { overlap = owp->nd_eoff - nfsd->nd_off; if (overlap < 0) panic("nfsrv_coalesce: bad off"); if (overlap > 0) m_adj(nfsd->nd_mrep, overlap); mp = owp->nd_mrep; while (mp->m_next) mp = mp->m_next; mp->m_next = nfsd->nd_mrep; owp->nd_eoff = nfsd->nd_eoff; } else m_freem(nfsd->nd_mrep); nfsd->nd_mrep = NULL; if (nfsd->nd_stable == NFSV3WRITE_FILESYNC) owp->nd_stable = NFSV3WRITE_FILESYNC; else if (nfsd->nd_stable == NFSV3WRITE_DATASYNC && owp->nd_stable == NFSV3WRITE_UNSTABLE) owp->nd_stable = NFSV3WRITE_DATASYNC; LIST_INSERT_HEAD(&owp->nd_coalesce, nfsd, nd_tq); /* * If nfsd had anything else coalesced into it, transfer them * to owp, otherwise their replies will never get sent. */ for (p = LIST_FIRST(&nfsd->nd_coalesce); p; p = LIST_FIRST(&nfsd->nd_coalesce)) { LIST_REMOVE(p, nd_tq); LIST_INSERT_HEAD(&owp->nd_coalesce, p, nd_tq); } } /* * nfs create service * now does a truncate to 0 length via. setattr if it already exists */ int nfsrv_create(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nfs_fattr *fp; struct vattr va, dirfor, diraft; struct vattr *vap = &va; struct nfsv2_sattr *sp; u_int32_t *tl; struct nameidata nd; caddr_t bpos; int error = 0, rdev, len, tsize, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3), how, exclusive_flag = 0; caddr_t cp; struct mbuf *mb, *mreq; struct vnode *dirp = NULL; nfsfh_t nfh; fhandle_t *fhp; u_quad_t tempsize; u_char cverf[NFSX_V3CREATEVERF]; struct mount *mp = NULL; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; #ifndef nolint rdev = 0; #endif ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto ereply; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE; /* * Call namei and do initial cleanup to get a few things * out of the way. If we get an initial error we cleanup * and return here to avoid special-casing the invalid nd * structure through the rest of the case. dirp may be * set even if an error occurs, but the nd structure will not * be valid at all if an error occurs so we have to invalidate it * prior to calling nfsm_reply ( which might goto nfsmout ). */ error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); if (dirp && !v3) { vrele(dirp); dirp = NULL; } if (error) { nfsm_reply(NFSX_WCCDATA(v3)); if (v3) nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; goto nfsmout; } /* * No error. Continue. State: * * startdir is valid ( we release this immediately ) * dirp may be valid * nd.ni_vp may be valid * nd.ni_dvp is valid * * The error state is set through the code and we may also do some * opportunistic releasing of vnodes to avoid holding locks through * NFS I/O. The cleanup at the end is a catch-all */ VATTR_NULL(vap); if (v3) { tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED); how = fxdr_unsigned(int, *tl); switch (how) { case NFSV3CREATE_GUARDED: if (nd.ni_vp) { error = EEXIST; break; } /* fall through */ case NFSV3CREATE_UNCHECKED: nfsm_srvsattr(vap); break; case NFSV3CREATE_EXCLUSIVE: cp = nfsm_dissect_nonblock(caddr_t, NFSX_V3CREATEVERF); bcopy(cp, cverf, NFSX_V3CREATEVERF); exclusive_flag = 1; break; }; vap->va_type = VREG; } else { sp = nfsm_dissect_nonblock(struct nfsv2_sattr *, NFSX_V2SATTR); vap->va_type = IFTOVT(fxdr_unsigned(u_int32_t, sp->sa_mode)); if (vap->va_type == VNON) vap->va_type = VREG; vap->va_mode = nfstov_mode(sp->sa_mode); switch (vap->va_type) { case VREG: tsize = fxdr_unsigned(int32_t, sp->sa_size); if (tsize != -1) vap->va_size = (u_quad_t)tsize; break; case VCHR: case VBLK: case VFIFO: rdev = fxdr_unsigned(long, sp->sa_size); break; default: break; }; } /* * Iff doesn't exist, create it * otherwise just truncate to 0 length * should I set the mode too ? * * The only possible error we can have at this point is EEXIST. * nd.ni_vp will also be non-NULL in that case. */ if (nd.ni_vp == NULL) { if (vap->va_mode == (mode_t)VNOVAL) vap->va_mode = 0; if (vap->va_type == VREG || vap->va_type == VSOCK) { error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); if (error) NDFREE(&nd, NDF_ONLY_PNBUF); else { if (exclusive_flag) { exclusive_flag = 0; VATTR_NULL(vap); bcopy(cverf, (caddr_t)&vap->va_atime, NFSX_V3CREATEVERF); error = VOP_SETATTR(nd.ni_vp, vap, cred, td); } } } else if (vap->va_type == VCHR || vap->va_type == VBLK || vap->va_type == VFIFO) { /* * NFSv2-specific code for creating device nodes * and fifos. * * Handle SysV FIFO node special cases. All other * devices require super user to access. */ if (vap->va_type == VCHR && rdev == 0xffffffff) vap->va_type = VFIFO; if (vap->va_type != VFIFO && (error = suser_cred(cred, 0))) { goto ereply; } vap->va_rdev = rdev; error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); goto ereply; } vput(nd.ni_vp); nd.ni_vp = NULL; /* * release dvp prior to lookup */ vput(nd.ni_dvp); nd.ni_dvp = NULL; /* * Setup for lookup. * * Even though LOCKPARENT was cleared, ni_dvp may * be garbage. */ nd.ni_cnd.cn_nameiop = LOOKUP; nd.ni_cnd.cn_flags &= ~(LOCKPARENT); nd.ni_cnd.cn_thread = td; nd.ni_cnd.cn_cred = cred; tvfslocked = VFS_LOCK_GIANT(nd.ni_startdir->v_mount); if (tvfslocked) nd.ni_cnd.cn_flags |= GIANTHELD; error = lookup(&nd); nd.ni_dvp = NULL; vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); nd.ni_cnd.cn_flags &= ~GIANTHELD; if (error) goto ereply; if (nd.ni_cnd.cn_flags & ISSYMLINK) { error = EINVAL; goto ereply; } } else { error = ENXIO; } } else { if (vap->va_size != -1) { error = nfsrv_access(nd.ni_vp, VWRITE, cred, (nd.ni_cnd.cn_flags & RDONLY), td, 0); if (!error) { tempsize = vap->va_size; VATTR_NULL(vap); vap->va_size = tempsize; error = VOP_SETATTR(nd.ni_vp, vap, cred, td); } } } if (!error) { bzero((caddr_t)fhp, sizeof(nfh)); fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(nd.ni_vp, &fhp->fh_fid); if (!error) error = VOP_GETATTR(nd.ni_vp, vap, cred, td); } if (v3) { if (exclusive_flag && !error && bcmp(cverf, (caddr_t)&vap->va_atime, NFSX_V3CREATEVERF)) error = EEXIST; if (dirp == nd.ni_dvp) diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); else { /* Drop the other locks to avoid deadlock. */ if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); nd.ni_dvp = NULL; nd.ni_vp = NULL; - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } } ereply: nfsm_reply(NFSX_SRVFH(v3) + NFSX_FATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { if (!error) { nfsm_srvpostop_fh(fhp); nfsm_srvpostop_attr(0, vap); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } else if (!error) { /* v2 non-error case. */ nfsm_srvfhtom(fhp, v3); fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } error = 0; nfsmout: if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); if (nd.ni_startdir) { vrele(nd.ni_startdir); nd.ni_startdir = NULL; } if (dirp) vrele(dirp); NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * nfs v3 mknod service */ int nfsrv_mknod(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct vattr va, dirfor, diraft; struct vattr *vap = &va; u_int32_t *tl; struct nameidata nd; caddr_t bpos; int error = 0, len, dirfor_ret = 1, diraft_ret = 1; u_int32_t major, minor; enum vtype vtyp; struct mbuf *mb, *mreq; struct vnode *vp, *dirp = NULL; nfsfh_t nfh; fhandle_t *fhp; struct mount *mp = NULL; int v3 = (nfsd->nd_flag & ND_NFSV3); int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; if (!v3) panic("nfsrv_mknod: v3 proc called on a v2 connection"); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto ereply; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE; /* * Handle nfs_namei() call. If an error occurs, the nd structure * is not valid. However, nfsm_*() routines may still jump to * nfsmout. */ error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); if (error) { nfsm_reply(NFSX_WCCDATA(1)); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; goto nfsmout; } tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED); vtyp = nfsv3tov_type(*tl); if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) { error = NFSERR_BADTYPE; goto out; } VATTR_NULL(vap); nfsm_srvsattr(vap); if (vtyp == VCHR || vtyp == VBLK) { tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED); major = fxdr_unsigned(u_int32_t, *tl++); minor = fxdr_unsigned(u_int32_t, *tl); vap->va_rdev = makedev(major, minor); } /* * Iff doesn't exist, create it. */ if (nd.ni_vp) { error = EEXIST; goto out; } vap->va_type = vtyp; if (vap->va_mode == (mode_t)VNOVAL) vap->va_mode = 0; if (vtyp == VSOCK) { vrele(nd.ni_startdir); nd.ni_startdir = NULL; error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); if (error) NDFREE(&nd, NDF_ONLY_PNBUF); } else { if (vtyp != VFIFO && (error = suser_cred(cred, 0))) goto out; error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); goto out; } vput(nd.ni_vp); nd.ni_vp = NULL; /* * Release dvp prior to lookup */ vput(nd.ni_dvp); nd.ni_dvp = NULL; nd.ni_cnd.cn_nameiop = LOOKUP; nd.ni_cnd.cn_flags &= ~(LOCKPARENT); nd.ni_cnd.cn_thread = td; nd.ni_cnd.cn_cred = td->td_ucred; tvfslocked = VFS_LOCK_GIANT(nd.ni_startdir->v_mount); if (tvfslocked) nd.ni_cnd.cn_flags |= GIANTHELD; error = lookup(&nd); nd.ni_dvp = NULL; vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); nd.ni_cnd.cn_flags &= ~GIANTHELD; if (error) goto out; if (nd.ni_cnd.cn_flags & ISSYMLINK) error = EINVAL; } /* * send response, cleanup, return. */ out: vp = nd.ni_vp; if (!error) { bzero((caddr_t)fhp, sizeof(nfh)); fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fhp->fh_fid); if (!error) error = VOP_GETATTR(vp, vap, cred, td); } if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); nd.ni_dvp = NULL; } if (vp) { vput(vp); vp = NULL; nd.ni_vp = NULL; } if (nd.ni_startdir) { vrele(nd.ni_startdir); nd.ni_startdir = NULL; } NDFREE(&nd, NDF_ONLY_PNBUF); if (dirp) { - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } ereply: nfsm_reply(NFSX_SRVFH(1) + NFSX_POSTOPATTR(1) + NFSX_WCCDATA(1)); if (v3) { if (!error) { nfsm_srvpostop_fh(fhp); nfsm_srvpostop_attr(0, vap); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (0); nfsmout: if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); if (dirp) vrele(dirp); if (nd.ni_startdir) vrele(nd.ni_startdir); NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * nfs remove service */ int nfsrv_remove(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nameidata nd; caddr_t bpos; int error = 0, len, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *dirp; struct vattr dirfor, diraft; nfsfh_t nfh; fhandle_t *fhp; struct mount *mp = NULL; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto ereply; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | MPSAFE; error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = NDHASGIANT(&nd); if (dirp && !v3) { vrele(dirp); dirp = NULL; } if (error == 0) { if (nd.ni_vp->v_type == VDIR) { error = EPERM; /* POSIX */ goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (nd.ni_vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } out: if (!error) { error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); } } if (dirp && v3) { if (dirp == nd.ni_dvp) diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); else { /* Drop the other locks to avoid deadlock. */ if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); nd.ni_dvp = NULL; nd.ni_vp = NULL; - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } vrele(dirp); dirp = NULL; } ereply: nfsm_reply(NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; } nfsmout: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs rename service */ int nfsrv_rename(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; caddr_t bpos; int error = 0, len, len2, fdirfor_ret = 1, fdiraft_ret = 1; int tdirfor_ret = 1, tdiraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct nameidata fromnd, tond; struct vnode *fvp, *tvp, *tdvp, *fdirp = NULL; struct vnode *tdirp = NULL; struct vattr fdirfor, fdiraft, tdirfor, tdiraft; nfsfh_t fnfh, tnfh; fhandle_t *ffhp, *tfhp; uid_t saved_uid; struct mount *mp = NULL; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; #ifndef nolint fvp = NULL; #endif ffhp = &fnfh.fh_generic; tfhp = &tnfh.fh_generic; /* * Clear fields incase goto nfsmout occurs from macro. */ ndclear(&fromnd); ndclear(&tond); nfsm_srvmtofh(ffhp); if ((mp = vfs_getvfs(&ffhp->fh_fsid)) == NULL) { error = ESTALE; goto out1; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); /* * Remember our original uid so that we can reset cr_uid before * the second nfs_namei() call, in case it is remapped. */ saved_uid = cred->cr_uid; fromnd.ni_cnd.cn_cred = cred; fromnd.ni_cnd.cn_nameiop = DELETE; fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART | MPSAFE; error = nfs_namei(&fromnd, ffhp, len, slp, nam, &md, &dpos, &fdirp, v3, &fdirfor, &fdirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &fromnd); if (fdirp && !v3) { vrele(fdirp); fdirp = NULL; } if (error) { nfsm_reply(2 * NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); } error = 0; goto nfsmout; } fvp = fromnd.ni_vp; nfsm_srvmtofh(tfhp); nfsm_srvnamesiz(len2); cred->cr_uid = saved_uid; tond.ni_cnd.cn_cred = cred; tond.ni_cnd.cn_nameiop = RENAME; tond.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | MPSAFE; error = nfs_namei(&tond, tfhp, len2, slp, nam, &md, &dpos, &tdirp, v3, &tdirfor, &tdirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &tond); if (tdirp && !v3) { vrele(tdirp); tdirp = NULL; } if (error) goto out1; tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { if (v3) error = EEXIST; else error = EISDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { if (v3) error = EEXIST; else error = ENOTDIR; goto out; } if (tvp->v_type == VDIR && tvp->v_mountedhere) { if (v3) error = EXDEV; else error = ENOTEMPTY; goto out; } } if (fvp->v_type == VDIR && fvp->v_mountedhere) { if (v3) error = EXDEV; else error = ENOTEMPTY; goto out; } if (fvp->v_mount != tdvp->v_mount) { if (v3) error = EXDEV; else error = ENOTEMPTY; goto out; } if (fvp == tdvp) { if (v3) error = EINVAL; else error = ENOTEMPTY; } /* * If source is the same as the destination (that is the * same vnode with the same name in the same directory), * then there is nothing to do. */ if (fvp == tvp && fromnd.ni_dvp == tdvp && fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen)) error = -1; out: if (!error) { /* * The VOP_RENAME function releases all vnode references & * locks prior to returning so we need to clear the pointers * to bypass cleanup code later on. */ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); fromnd.ni_dvp = NULL; fromnd.ni_vp = NULL; tond.ni_dvp = NULL; tond.ni_vp = NULL; if (error) { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } } else { if (error == -1) error = 0; } /* fall through */ out1: nfsm_reply(2 * NFSX_WCCDATA(v3)); if (v3) { /* Release existing locks to prevent deadlock. */ if (tond.ni_dvp) { if (tond.ni_dvp == tond.ni_vp) vrele(tond.ni_dvp); else vput(tond.ni_dvp); } if (tond.ni_vp) vput(tond.ni_vp); tond.ni_dvp = NULL; tond.ni_vp = NULL; if (fdirp) { - vn_lock(fdirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(fdirp, LK_EXCLUSIVE | LK_RETRY); fdiraft_ret = VOP_GETATTR(fdirp, &fdiraft, cred, td); VOP_UNLOCK(fdirp, 0, td); } if (tdirp) { - vn_lock(tdirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(tdirp, LK_EXCLUSIVE | LK_RETRY); tdiraft_ret = VOP_GETATTR(tdirp, &tdiraft, cred, td); VOP_UNLOCK(tdirp, 0, td); } nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); } error = 0; /* fall through */ nfsmout: /* * Clear out tond related fields */ if (tond.ni_dvp) { if (tond.ni_dvp == tond.ni_vp) vrele(tond.ni_dvp); else vput(tond.ni_dvp); } if (tond.ni_vp) vput(tond.ni_vp); if (tdirp) vrele(tdirp); if (tond.ni_startdir) vrele(tond.ni_startdir); NDFREE(&tond, NDF_ONLY_PNBUF); /* * Clear out fromnd related fields */ if (fdirp) vrele(fdirp); if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); NDFREE(&fromnd, NDF_ONLY_PNBUF); if (fromnd.ni_dvp) vrele(fromnd.ni_dvp); if (fromnd.ni_vp) vrele(fromnd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * nfs link service */ int nfsrv_link(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nameidata nd; caddr_t bpos; int error = 0, rdonly, len, dirfor_ret = 1, diraft_ret = 1; int getret = 1, v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *vp = NULL, *xp, *dirp = NULL; struct vattr dirfor, diraft, at; nfsfh_t nfh, dnfh; fhandle_t *fhp, *dfhp; struct mount *mp = NULL; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); vfslocked = 0; fhp = &nfh.fh_generic; dfhp = &dnfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto ereply; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvmtofh(dfhp); nfsm_srvnamesiz(len); error = nfsrv_fhtovp(fhp, TRUE, &vp, &tvfslocked, cred, slp, nam, &rdonly, TRUE); vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked); if (error) { nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { nfsm_srvpostop_attr(getret, &at); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } vp = NULL; error = 0; goto nfsmout; } if (v3) getret = VOP_GETATTR(vp, &at, cred, td); if (vp->v_type == VDIR) { error = EPERM; /* POSIX */ goto out1; } VOP_UNLOCK(vp, 0, td); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | MPSAFE | MPSAFE; error = nfs_namei(&nd, dfhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); if (dirp && !v3) { vrele(dirp); dirp = NULL; } if (error) { vrele(vp); vp = NULL; goto out2; } xp = nd.ni_vp; if (xp != NULL) { error = EEXIST; vrele(vp); vp = NULL; goto out2; } xp = nd.ni_dvp; if (vp->v_mount != xp->v_mount) { error = EXDEV; vrele(vp); vp = NULL; goto out2; } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); /* fall through */ out1: if (v3) getret = VOP_GETATTR(vp, &at, cred, td); out2: if (dirp) { if (dirp == nd.ni_dvp) diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); else { /* Release existing locks to prevent deadlock. */ if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vrele(nd.ni_vp); nd.ni_dvp = NULL; nd.ni_vp = NULL; - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } } ereply: nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { nfsm_srvpostop_attr(getret, &at); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; } /* fall through */ nfsmout: NDFREE(&nd, NDF_ONLY_PNBUF); if (vp) vput(vp); if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (dirp) vrele(dirp); if (nd.ni_vp) vrele(nd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs symbolic link service */ int nfsrv_symlink(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct vattr va, dirfor, diraft; struct nameidata nd; struct vattr *vap = &va; struct nfsv2_sattr *sp; char *bpos, *pathcp = NULL; struct uio io; struct iovec iv; int error = 0, len, len2, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *dirp = NULL; nfsfh_t nfh; fhandle_t *fhp; struct mount *mp = NULL; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto out; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | SAVESTART | MPSAFE; error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); if (error == 0) { VATTR_NULL(vap); if (v3) nfsm_srvsattr(vap); nfsm_srvpathsiz(len2); } if (dirp && !v3) { vrele(dirp); dirp = NULL; } if (error) goto out; MALLOC(pathcp, caddr_t, len2 + 1, M_TEMP, M_WAITOK); iv.iov_base = pathcp; iv.iov_len = len2; io.uio_resid = len2; io.uio_offset = 0; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; nfsm_mtouio(&io, len2); if (!v3) { sp = nfsm_dissect_nonblock(struct nfsv2_sattr *, NFSX_V2SATTR); vap->va_mode = nfstov_mode(sp->sa_mode); } *(pathcp + len2) = '\0'; if (nd.ni_vp) { error = EEXIST; goto out; } /* * issue symlink op. SAVESTART is set so the underlying path component * is only freed by the VOP if an error occurs. */ if (vap->va_mode == (mode_t)VNOVAL) vap->va_mode = 0; error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap, pathcp); if (error) NDFREE(&nd, NDF_ONLY_PNBUF); else vput(nd.ni_vp); nd.ni_vp = NULL; /* * releases directory prior to potential lookup op. */ vput(nd.ni_dvp); nd.ni_dvp = NULL; if (error == 0) { if (v3) { /* * Issue lookup. Leave SAVESTART set so we can easily free * the name buffer later on. * * since LOCKPARENT is not set, ni_dvp will be garbage on * return whether an error occurs or not. */ nd.ni_cnd.cn_nameiop = LOOKUP; nd.ni_cnd.cn_flags &= ~(LOCKPARENT | FOLLOW); nd.ni_cnd.cn_flags |= (NOFOLLOW | LOCKLEAF); nd.ni_cnd.cn_thread = td; nd.ni_cnd.cn_cred = cred; tvfslocked = VFS_LOCK_GIANT(nd.ni_startdir->v_mount); if (tvfslocked) nd.ni_cnd.cn_flags |= GIANTHELD; error = lookup(&nd); nd.ni_dvp = NULL; vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); nd.ni_cnd.cn_flags &= ~GIANTHELD; if (error == 0) { bzero((caddr_t)fhp, sizeof(nfh)); fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(nd.ni_vp, &fhp->fh_fid); if (!error) error = VOP_GETATTR(nd.ni_vp, vap, cred, td); vput(nd.ni_vp); nd.ni_vp = NULL; } } } out: /* * These releases aren't strictly required, does even doing them * make any sense? XXX can nfsm_reply() block? */ if (pathcp) { FREE(pathcp, M_TEMP); pathcp = NULL; } if (dirp) { - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } if (nd.ni_startdir) { vrele(nd.ni_startdir); nd.ni_startdir = NULL; } nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { if (!error) { nfsm_srvpostop_fh(fhp); nfsm_srvpostop_attr(0, vap); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } error = 0; /* fall through */ nfsmout: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vrele(nd.ni_vp); if (nd.ni_startdir) vrele(nd.ni_startdir); if (dirp) vrele(dirp); if (pathcp) FREE(pathcp, M_TEMP); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * nfs mkdir service */ int nfsrv_mkdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct vattr va, dirfor, diraft; struct vattr *vap = &va; struct nfs_fattr *fp; struct nameidata nd; u_int32_t *tl; caddr_t bpos; int error = 0, len, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *dirp = NULL; int vpexcl = 0; nfsfh_t nfh; fhandle_t *fhp; struct mount *mp = NULL; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto out; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | MPSAFE; error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); if (dirp && !v3) { vrele(dirp); dirp = NULL; } if (error) { nfsm_reply(NFSX_WCCDATA(v3)); if (v3) nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; goto nfsmout; } VATTR_NULL(vap); if (v3) { nfsm_srvsattr(vap); } else { tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED); vap->va_mode = nfstov_mode(*tl++); } /* * At this point nd.ni_dvp is referenced and exclusively locked and * nd.ni_vp, if it exists, is referenced but not locked. */ vap->va_type = VDIR; if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); error = EEXIST; goto out; } /* * Issue mkdir op. Since SAVESTART is not set, the pathname * component is freed by the VOP call. This will fill-in * nd.ni_vp, reference, and exclusively lock it. */ if (vap->va_mode == (mode_t)VNOVAL) vap->va_mode = 0; error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); NDFREE(&nd, NDF_ONLY_PNBUF); vpexcl = 1; vput(nd.ni_dvp); nd.ni_dvp = NULL; if (!error) { bzero((caddr_t)fhp, sizeof(nfh)); fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(nd.ni_vp, &fhp->fh_fid); if (!error) error = VOP_GETATTR(nd.ni_vp, vap, cred, td); } out: if (dirp) { if (dirp == nd.ni_dvp) { diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); } else { /* Release existing locks to prevent deadlock. */ if (nd.ni_dvp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp && vpexcl) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) { if (vpexcl) vput(nd.ni_vp); else vrele(nd.ni_vp); } nd.ni_dvp = NULL; nd.ni_vp = NULL; - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } } nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { if (!error) { nfsm_srvpostop_fh(fhp); nfsm_srvpostop_attr(0, vap); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } else if (!error) { /* v2 non-error case. */ nfsm_srvfhtom(fhp, v3); fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } error = 0; /* fall through */ nfsmout: if (nd.ni_dvp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp && vpexcl) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) { if (vpexcl) vput(nd.ni_vp); else vrele(nd.ni_vp); } if (dirp) vrele(dirp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * nfs rmdir service */ int nfsrv_rmdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; caddr_t bpos; int error = 0, len, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *vp, *dirp = NULL; struct vattr dirfor, diraft; nfsfh_t nfh; fhandle_t *fhp; struct nameidata nd; struct mount *mp = NULL; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto out; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | MPSAFE; error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, v3, &dirfor, &dirfor_ret, td, FALSE); vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd); if (dirp && !v3) { vrele(dirp); dirp = NULL; } if (error) { nfsm_reply(NFSX_WCCDATA(v3)); if (v3) nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; goto nfsmout; } vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) error = EBUSY; out: /* * Issue or abort op. Since SAVESTART is not set, path name * component is freed by the VOP after either. */ if (!error) error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); if (dirp) { if (dirp == nd.ni_dvp) diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); else { /* Release existing locks to prevent deadlock. */ if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); nd.ni_dvp = NULL; nd.ni_vp = NULL; - vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY); diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td); VOP_UNLOCK(dirp, 0, td); } } nfsm_reply(NFSX_WCCDATA(v3)); error = 0; if (v3) nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); /* fall through */ nfsmout: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); } if (nd.ni_vp) vput(nd.ni_vp); if (dirp) vrele(dirp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs readdir service * - mallocs what it thinks is enough to read * count rounded up to a multiple of NFS_DIRBLKSIZ <= NFS_MAXREADDIR * - calls VOP_READDIR() * - loops around building the reply * if the output generated exceeds count break out of loop * The nfsm_clget macro is used here so that the reply will be packed * tightly in mbuf clusters. * - it only knows that it has encountered eof when the VOP_READDIR() * reads nothing * - as such one readdir rpc will return eof false although you are there * and then the next will return eof * - it trims out records with d_fileno == 0 * this doesn't matter for Unix clients, but they might confuse clients * for other os'. * NB: It is tempting to set eof to true if the VOP_READDIR() reads less * than requested, but this may not apply to all filesystems. For * example, client NFS does not { although it is never remote mounted * anyhow } * The alternate call nfsrv_readdirplus() does lookups as well. * PS: The NFS protocol spec. does not clarify what the "count" byte * argument is a count of.. just name strings and file id's or the * entire reply rpc or ... * I tried just file name and id sizes and it confused the Sun client, * so I am using the full rpc size now. The "paranoia.." comment refers * to including the status longwords that are not a part of the dir. * "entry" structures, but are in the rpc. */ struct flrep { nfsuint64 fl_off; u_int32_t fl_postopok; u_int32_t fl_fattr[NFSX_V3FATTR / sizeof (u_int32_t)]; u_int32_t fl_fhok; u_int32_t fl_fhsize; u_int32_t fl_nfh[NFSX_V3FH / sizeof (u_int32_t)]; }; int nfsrv_readdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; char *bp, *be; struct mbuf *mp; struct dirent *dp; caddr_t cp; u_int32_t *tl; caddr_t bpos; struct mbuf *mb, *mreq; char *cpos, *cend, *rbuf; struct vnode *vp = NULL; struct vattr at; nfsfh_t nfh; fhandle_t *fhp; struct uio io; struct iovec iv; int len, nlen, rem, xfer, tsiz, i, error = 0, getret = 1; int siz, cnt, fullsiz, eofflag, rdonly, ncookies; int v3 = (nfsd->nd_flag & ND_NFSV3); u_quad_t off, toff, verf; u_long *cookies = NULL, *cookiep; /* needs to be int64_t or off_t */ int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if (v3) { tl = nfsm_dissect_nonblock(u_int32_t *, 5 * NFSX_UNSIGNED); toff = fxdr_hyper(tl); tl += 2; verf = fxdr_hyper(tl); tl += 2; } else { tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED); toff = fxdr_unsigned(u_quad_t, *tl++); verf = 0; /* shut up gcc */ } off = toff; cnt = fxdr_unsigned(int, *tl); siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); xfer = NFS_SRVMAXDATA(nfsd); if (cnt > xfer) cnt = xfer; if (siz > xfer) siz = xfer; fullsiz = siz; error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (!error && vp->v_type != VDIR) { error = ENOTDIR; vput(vp); vp = NULL; } if (error) { nfsm_reply(NFSX_UNSIGNED); if (v3) nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } /* * Obtain lock on vnode for this section of the code */ if (v3) { error = getret = VOP_GETATTR(vp, &at, cred, td); #if 0 /* * XXX This check may be too strict for Solaris 2.5 clients. */ if (!error && toff && verf && verf != at.va_filerev) error = NFSERR_BAD_COOKIE; #endif } if (!error) error = nfsrv_access(vp, VEXEC, cred, rdonly, td, 0); if (error) { vput(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(v3)); if (v3) nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } VOP_UNLOCK(vp, 0, td); /* * end section. Allocate rbuf and continue */ MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); again: iv.iov_base = rbuf; iv.iov_len = fullsiz; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_offset = (off_t)off; io.uio_resid = fullsiz; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; eofflag = 0; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (cookies) { free((caddr_t)cookies, M_TEMP); cookies = NULL; } error = VOP_READDIR(vp, &io, cred, &eofflag, &ncookies, &cookies); off = (off_t)io.uio_offset; if (!cookies && !error) error = NFSERR_PERM; if (v3) { getret = VOP_GETATTR(vp, &at, cred, td); if (!error) error = getret; } VOP_UNLOCK(vp, 0, td); if (error) { vrele(vp); vp = NULL; free((caddr_t)rbuf, M_TEMP); if (cookies) free((caddr_t)cookies, M_TEMP); nfsm_reply(NFSX_POSTOPATTR(v3)); if (v3) nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } if (io.uio_resid) { siz -= io.uio_resid; /* * If nothing read, return eof * rpc reply */ if (siz == 0) { vrele(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) + 2 * NFSX_UNSIGNED); if (v3) { nfsm_srvpostop_attr(getret, &at); tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED); txdr_hyper(at.va_filerev, tl); tl += 2; } else tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = nfsrv_nfs_false; *tl = nfsrv_nfs_true; FREE((caddr_t)rbuf, M_TEMP); FREE((caddr_t)cookies, M_TEMP); error = 0; goto nfsmout; } } /* * Check for degenerate cases of nothing useful read. * If so go try again */ cpos = rbuf; cend = rbuf + siz; dp = (struct dirent *)cpos; cookiep = cookies; /* * For some reason FreeBSD's ufs_readdir() chooses to back the * directory offset up to a block boundary, so it is necessary to * skip over the records that precede the requested offset. This * requires the assumption that file offset cookies monotonically * increase. */ while (cpos < cend && ncookies > 0 && (dp->d_fileno == 0 || dp->d_type == DT_WHT || ((u_quad_t)(*cookiep)) <= toff)) { cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos >= cend || ncookies == 0) { toff = off; siz = fullsiz; goto again; } len = 3 * NFSX_UNSIGNED; /* paranoia, probably can be 0 */ nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) + siz); if (v3) { nfsm_srvpostop_attr(getret, &at); tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED); txdr_hyper(at.va_filerev, tl); } mp = mb; bp = bpos; be = bp + M_TRAILINGSPACE(mp); /* Loop through the records and build reply */ while (cpos < cend && ncookies > 0) { if (dp->d_fileno != 0 && dp->d_type != DT_WHT) { nlen = dp->d_namlen; rem = nfsm_rndup(nlen) - nlen; len += (4 * NFSX_UNSIGNED + nlen + rem); if (v3) len += 2 * NFSX_UNSIGNED; if (len > cnt) { eofflag = 0; break; } /* * Build the directory record xdr from * the dirent entry. */ nfsm_clget; *tl = nfsrv_nfs_true; bp += NFSX_UNSIGNED; if (v3) { nfsm_clget; *tl = 0; bp += NFSX_UNSIGNED; } nfsm_clget; *tl = txdr_unsigned(dp->d_fileno); bp += NFSX_UNSIGNED; nfsm_clget; *tl = txdr_unsigned(nlen); bp += NFSX_UNSIGNED; /* And loop around copying the name */ xfer = nlen; cp = dp->d_name; while (xfer > 0) { nfsm_clget; if ((bp+xfer) > be) tsiz = be-bp; else tsiz = xfer; bcopy(cp, bp, tsiz); bp += tsiz; xfer -= tsiz; if (xfer > 0) cp += tsiz; } /* And null pad to an int32_t boundary. */ for (i = 0; i < rem; i++) *bp++ = '\0'; nfsm_clget; /* Finish off the record */ if (v3) { *tl = 0; bp += NFSX_UNSIGNED; nfsm_clget; } *tl = txdr_unsigned(*cookiep); bp += NFSX_UNSIGNED; } cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } vrele(vp); vp = NULL; nfsm_clget; *tl = nfsrv_nfs_false; bp += NFSX_UNSIGNED; nfsm_clget; if (eofflag) *tl = nfsrv_nfs_true; else *tl = nfsrv_nfs_false; bp += NFSX_UNSIGNED; if (mp != mb) { if (bp < be) mp->m_len = bp - mtod(mp, caddr_t); } else mp->m_len += bp - bpos; FREE((caddr_t)rbuf, M_TEMP); FREE((caddr_t)cookies, M_TEMP); nfsmout: if (vp) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } int nfsrv_readdirplus(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; char *bp, *be; struct mbuf *mp; struct dirent *dp; caddr_t cp; u_int32_t *tl; caddr_t bpos; struct mbuf *mb, *mreq; char *cpos, *cend, *rbuf; struct vnode *vp = NULL, *nvp; struct flrep fl; nfsfh_t nfh; fhandle_t *fhp, *nfhp = (fhandle_t *)fl.fl_nfh; struct uio io; struct iovec iv; struct vattr va, at, *vap = &va; struct nfs_fattr *fp; int len, nlen, rem, xfer, tsiz, i, error = 0, getret = 1; int siz, cnt, fullsiz, eofflag, rdonly, dirlen, ncookies; u_quad_t off, toff, verf; u_long *cookies = NULL, *cookiep; /* needs to be int64_t or off_t */ int v3 = (nfsd->nd_flag & ND_NFSV3); int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; if (!v3) panic("nfsrv_readdirplus: v3 proc called on a v2 connection"); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); tl = nfsm_dissect_nonblock(u_int32_t *, 6 * NFSX_UNSIGNED); toff = fxdr_hyper(tl); tl += 2; verf = fxdr_hyper(tl); tl += 2; siz = fxdr_unsigned(int, *tl++); cnt = fxdr_unsigned(int, *tl); off = toff; siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); xfer = NFS_SRVMAXDATA(nfsd); if (cnt > xfer) cnt = xfer; if (siz > xfer) siz = xfer; fullsiz = siz; error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (!error && vp->v_type != VDIR) { error = ENOTDIR; vput(vp); vp = NULL; } if (error) { nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } error = getret = VOP_GETATTR(vp, &at, cred, td); #if 0 /* * XXX This check may be too strict for Solaris 2.5 clients. */ if (!error && toff && verf && verf != at.va_filerev) error = NFSERR_BAD_COOKIE; #endif if (!error) error = nfsrv_access(vp, VEXEC, cred, rdonly, td, 0); if (error) { vput(vp); vp = NULL; nfsm_reply(NFSX_V3POSTOPATTR); nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } VOP_UNLOCK(vp, 0, td); MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); again: iv.iov_base = rbuf; iv.iov_len = fullsiz; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_offset = (off_t)off; io.uio_resid = fullsiz; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; eofflag = 0; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (cookies) { free((caddr_t)cookies, M_TEMP); cookies = NULL; } error = VOP_READDIR(vp, &io, cred, &eofflag, &ncookies, &cookies); off = (u_quad_t)io.uio_offset; getret = VOP_GETATTR(vp, &at, cred, td); VOP_UNLOCK(vp, 0, td); if (!cookies && !error) error = NFSERR_PERM; if (!error) error = getret; if (error) { vrele(vp); vp = NULL; if (cookies) free((caddr_t)cookies, M_TEMP); free((caddr_t)rbuf, M_TEMP); nfsm_reply(NFSX_V3POSTOPATTR); nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } if (io.uio_resid) { siz -= io.uio_resid; /* * If nothing read, return eof * rpc reply */ if (siz == 0) { vrele(vp); vp = NULL; nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF + 2 * NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED); txdr_hyper(at.va_filerev, tl); tl += 2; *tl++ = nfsrv_nfs_false; *tl = nfsrv_nfs_true; FREE((caddr_t)cookies, M_TEMP); FREE((caddr_t)rbuf, M_TEMP); error = 0; goto nfsmout; } } /* * Check for degenerate cases of nothing useful read. * If so go try again */ cpos = rbuf; cend = rbuf + siz; dp = (struct dirent *)cpos; cookiep = cookies; /* * For some reason FreeBSD's ufs_readdir() chooses to back the * directory offset up to a block boundary, so it is necessary to * skip over the records that precede the requested offset. This * requires the assumption that file offset cookies monotonically * increase. */ while (cpos < cend && ncookies > 0 && (dp->d_fileno == 0 || dp->d_type == DT_WHT || ((u_quad_t)(*cookiep)) <= toff)) { cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos >= cend || ncookies == 0) { toff = off; siz = fullsiz; goto again; } /* * Probe one of the directory entries to see if the filesystem * supports VGET. */ if (VFS_VGET(vp->v_mount, dp->d_fileno, LK_EXCLUSIVE, &nvp) == EOPNOTSUPP) { error = NFSERR_NOTSUPP; vrele(vp); vp = NULL; free((caddr_t)cookies, M_TEMP); free((caddr_t)rbuf, M_TEMP); nfsm_reply(NFSX_V3POSTOPATTR); nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } vput(nvp); nvp = NULL; dirlen = len = NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF + 2 * NFSX_UNSIGNED; nfsm_reply(cnt); nfsm_srvpostop_attr(getret, &at); tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED); txdr_hyper(at.va_filerev, tl); mp = mb; bp = bpos; be = bp + M_TRAILINGSPACE(mp); /* Loop through the records and build reply */ while (cpos < cend && ncookies > 0) { if (dp->d_fileno != 0 && dp->d_type != DT_WHT) { nlen = dp->d_namlen; rem = nfsm_rndup(nlen)-nlen; /* * For readdir_and_lookup get the vnode using * the file number. */ if (VFS_VGET(vp->v_mount, dp->d_fileno, LK_EXCLUSIVE, &nvp)) goto invalid; bzero((caddr_t)nfhp, NFSX_V3FH); nfhp->fh_fsid = nvp->v_mount->mnt_stat.f_fsid; /* * XXXRW: Assert the mountpoints are the same so that * we know that acquiring Giant based on the * directory is the right thing for the child. */ KASSERT(nvp->v_mount == vp->v_mount, ("nfsrv_readdirplus: nvp mount != vp mount")); if (VOP_VPTOFH(nvp, &nfhp->fh_fid)) { vput(nvp); nvp = NULL; goto invalid; } if (VOP_GETATTR(nvp, vap, cred, td)) { vput(nvp); nvp = NULL; goto invalid; } vput(nvp); nvp = NULL; /* * If either the dircount or maxcount will be * exceeded, get out now. Both of these lengths * are calculated conservatively, including all * XDR overheads. */ len += (8 * NFSX_UNSIGNED + nlen + rem + NFSX_V3FH + NFSX_V3POSTOPATTR); dirlen += (6 * NFSX_UNSIGNED + nlen + rem); if (len > cnt || dirlen > fullsiz) { eofflag = 0; break; } /* * Build the directory record xdr from * the dirent entry. */ fp = (struct nfs_fattr *)&fl.fl_fattr; nfsm_srvfillattr(vap, fp); fl.fl_fhsize = txdr_unsigned(NFSX_V3FH); fl.fl_fhok = nfsrv_nfs_true; fl.fl_postopok = nfsrv_nfs_true; fl.fl_off.nfsuquad[0] = 0; fl.fl_off.nfsuquad[1] = txdr_unsigned(*cookiep); nfsm_clget; *tl = nfsrv_nfs_true; bp += NFSX_UNSIGNED; nfsm_clget; *tl = 0; bp += NFSX_UNSIGNED; nfsm_clget; *tl = txdr_unsigned(dp->d_fileno); bp += NFSX_UNSIGNED; nfsm_clget; *tl = txdr_unsigned(nlen); bp += NFSX_UNSIGNED; /* And loop around copying the name */ xfer = nlen; cp = dp->d_name; while (xfer > 0) { nfsm_clget; if ((bp + xfer) > be) tsiz = be - bp; else tsiz = xfer; bcopy(cp, bp, tsiz); bp += tsiz; xfer -= tsiz; if (xfer > 0) cp += tsiz; } /* And null pad to an int32_t boundary. */ for (i = 0; i < rem; i++) *bp++ = '\0'; /* * Now copy the flrep structure out. */ xfer = sizeof (struct flrep); cp = (caddr_t)&fl; while (xfer > 0) { nfsm_clget; if ((bp + xfer) > be) tsiz = be - bp; else tsiz = xfer; bcopy(cp, bp, tsiz); bp += tsiz; xfer -= tsiz; if (xfer > 0) cp += tsiz; } } invalid: cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } vrele(vp); vp = NULL; nfsm_clget; *tl = nfsrv_nfs_false; bp += NFSX_UNSIGNED; nfsm_clget; if (eofflag) *tl = nfsrv_nfs_true; else *tl = nfsrv_nfs_false; bp += NFSX_UNSIGNED; if (mp != mb) { if (bp < be) mp->m_len = bp - mtod(mp, caddr_t); } else mp->m_len += bp - bpos; FREE((caddr_t)cookies, M_TEMP); FREE((caddr_t)rbuf, M_TEMP); nfsmout: if (vp) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs commit service */ int nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct vattr bfor, aft; struct vnode *vp = NULL; nfsfh_t nfh; fhandle_t *fhp; u_int32_t *tl; caddr_t bpos; int error = 0, rdonly, for_ret = 1, aft_ret = 1, cnt; struct mbuf *mb, *mreq; u_quad_t off; struct mount *mp = NULL; int v3 = (nfsd->nd_flag & ND_NFSV3); int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; if (!v3) panic("nfsrv_commit: v3 proc called on a v2 connection"); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) { error = ESTALE; goto ereply; } vfslocked = VFS_LOCK_GIANT(mp); (void) vn_start_write(NULL, &mp, V_WAIT); vfs_rel(mp); /* The write holds a ref. */ tl = nfsm_dissect_nonblock(u_int32_t *, 3 * NFSX_UNSIGNED); /* * XXX At this time VOP_FSYNC() does not accept offset and byte * count parameters, so these arguments are useless (someday maybe). */ off = fxdr_hyper(tl); tl += 2; cnt = fxdr_unsigned(int, *tl); error = nfsrv_fhtovp(fhp, 1, &vp, &tvfslocked, cred, slp, nam, &rdonly, TRUE); vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked); if (error) { nfsm_reply(2 * NFSX_UNSIGNED); nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft); error = 0; goto nfsmout; } for_ret = VOP_GETATTR(vp, &bfor, cred, td); if (cnt > MAX_COMMIT_COUNT) { /* * Give up and do the whole thing */ if (vp->v_object && (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); } else { /* * Locate and synchronously write any buffers that fall * into the requested range. Note: we are assuming that * f_iosize is a power of 2. */ int iosize = vp->v_mount->mnt_stat.f_iosize; int iomask = iosize - 1; int s; daddr_t lblkno; /* * Align to iosize boundry, super-align to page boundry. */ if (off & iomask) { cnt += off & iomask; off &= ~(u_quad_t)iomask; } if (off & PAGE_MASK) { cnt += off & PAGE_MASK; off &= ~(u_quad_t)PAGE_MASK; } lblkno = off / iosize; if (vp->v_object && (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, off / PAGE_SIZE, (cnt + PAGE_MASK) / PAGE_SIZE, OBJPC_SYNC); VM_OBJECT_UNLOCK(vp->v_object); } s = splbio(); VI_LOCK(vp); while (cnt > 0) { struct buf *bp; /* * If we have a buffer and it is marked B_DELWRI we * have to lock and write it. Otherwise the prior * write is assumed to have already been committed. * * gbincore() can return invalid buffers now so we * have to check that bit as well (though B_DELWRI * should not be set if B_INVAL is set there could be * a race here since we haven't locked the buffer). */ if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) { VI_LOCK(vp); continue; /* retry */ } if ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI) { bremfree(bp); bp->b_flags &= ~B_ASYNC; bwrite(bp); ++nfs_commit_miss; } else BUF_UNLOCK(bp); VI_LOCK(vp); } ++nfs_commit_blks; if (cnt < iosize) break; cnt -= iosize; ++lblkno; } VI_UNLOCK(vp); splx(s); } aft_ret = VOP_GETATTR(vp, &aft, cred, td); vput(vp); vp = NULL; ereply: nfsm_reply(NFSX_V3WCCDATA + NFSX_V3WRITEVERF); nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft); if (!error) { tl = nfsm_build(u_int32_t *, NFSX_V3WRITEVERF); if (nfsver.tv_sec == 0) nfsver = boottime; *tl++ = txdr_unsigned(nfsver.tv_sec); *tl = txdr_unsigned(nfsver.tv_usec); } else { error = 0; } nfsmout: if (vp) vput(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs statfs service */ int nfsrv_statfs(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct statfs *sf; struct nfs_statfs *sfp; caddr_t bpos; int error = 0, rdonly, getret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); struct mbuf *mb, *mreq; struct vnode *vp = NULL; struct vattr at; nfsfh_t nfh; fhandle_t *fhp; struct statfs statfs; u_quad_t tval; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); vfslocked = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { nfsm_reply(NFSX_UNSIGNED); if (v3) nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } sf = &statfs; error = VFS_STATFS(vp->v_mount, sf, td); getret = VOP_GETATTR(vp, &at, cred, td); vput(vp); vp = NULL; nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_STATFS(v3)); if (v3) nfsm_srvpostop_attr(getret, &at); if (error) { error = 0; goto nfsmout; } sfp = nfsm_build(struct nfs_statfs *, NFSX_STATFS(v3)); if (v3) { tval = (u_quad_t)sf->f_blocks; tval *= (u_quad_t)sf->f_bsize; txdr_hyper(tval, &sfp->sf_tbytes); tval = (u_quad_t)sf->f_bfree; tval *= (u_quad_t)sf->f_bsize; txdr_hyper(tval, &sfp->sf_fbytes); /* * Don't send negative values for available space, * since this field is unsigned in the NFS protocol. * Otherwise, the client would see absurdly high * numbers for free space. */ if (sf->f_bavail < 0) tval = 0; else tval = (u_quad_t)sf->f_bavail; tval *= (u_quad_t)sf->f_bsize; txdr_hyper(tval, &sfp->sf_abytes); sfp->sf_tfiles.nfsuquad[0] = 0; sfp->sf_tfiles.nfsuquad[1] = txdr_unsigned(sf->f_files); sfp->sf_ffiles.nfsuquad[0] = 0; sfp->sf_ffiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree); sfp->sf_afiles.nfsuquad[0] = 0; sfp->sf_afiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree); sfp->sf_invarsec = 0; } else { sfp->sf_tsize = txdr_unsigned(NFS_MAXDGRAMDATA); sfp->sf_bsize = txdr_unsigned(sf->f_bsize); sfp->sf_blocks = txdr_unsigned(sf->f_blocks); sfp->sf_bfree = txdr_unsigned(sf->f_bfree); if (sf->f_bavail < 0) sfp->sf_bavail = 0; else sfp->sf_bavail = txdr_unsigned(sf->f_bavail); } nfsmout: if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs fsinfo service */ int nfsrv_fsinfo(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nfsv3_fsinfo *sip; caddr_t bpos; int error = 0, rdonly, getret = 1, pref; struct mbuf *mb, *mreq; struct vnode *vp = NULL; struct vattr at; nfsfh_t nfh; fhandle_t *fhp; u_quad_t maxfsize; struct statfs sb; int v3 = (nfsd->nd_flag & ND_NFSV3); int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (!v3) panic("nfsrv_fsinfo: v3 proc called on a v2 connection"); fhp = &nfh.fh_generic; vfslocked = 0; nfsm_srvmtofh(fhp); error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } /* XXX Try to make a guess on the max file size. */ VFS_STATFS(vp->v_mount, &sb, td); maxfsize = (u_quad_t)0x80000000 * sb.f_bsize - 1; getret = VOP_GETATTR(vp, &at, cred, td); vput(vp); vp = NULL; nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3FSINFO); nfsm_srvpostop_attr(getret, &at); sip = nfsm_build(struct nfsv3_fsinfo *, NFSX_V3FSINFO); /* * XXX * There should be filesystem VFS OP(s) to get this information. * For now, assume ufs. */ if (slp->ns_so->so_type == SOCK_DGRAM) pref = NFS_MAXDGRAMDATA; else pref = NFS_MAXDATA; sip->fs_rtmax = txdr_unsigned(pref); sip->fs_rtpref = txdr_unsigned(pref); sip->fs_rtmult = txdr_unsigned(NFS_FABLKSIZE); sip->fs_wtmax = txdr_unsigned(pref); sip->fs_wtpref = txdr_unsigned(pref); sip->fs_wtmult = txdr_unsigned(NFS_FABLKSIZE); sip->fs_dtpref = txdr_unsigned(pref); txdr_hyper(maxfsize, &sip->fs_maxfilesize); sip->fs_timedelta.nfsv3_sec = 0; sip->fs_timedelta.nfsv3_nsec = txdr_unsigned(1); sip->fs_properties = txdr_unsigned(NFSV3FSINFO_LINK | NFSV3FSINFO_SYMLINK | NFSV3FSINFO_HOMOGENEOUS | NFSV3FSINFO_CANSETTIME); nfsmout: if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * nfs pathconf service */ int nfsrv_pathconf(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; struct sockaddr *nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; struct ucred *cred = nfsd->nd_cr; struct nfsv3_pathconf *pc; caddr_t bpos; int error = 0, rdonly, getret = 1; register_t linkmax, namemax, chownres, notrunc; struct mbuf *mb, *mreq; struct vnode *vp = NULL; struct vattr at; nfsfh_t nfh; fhandle_t *fhp; int v3 = (nfsd->nd_flag & ND_NFSV3); int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (!v3) panic("nfsrv_pathconf: v3 proc called on a v2 connection"); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam, &rdonly, TRUE); if (error) { nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); error = 0; goto nfsmout; } error = VOP_PATHCONF(vp, _PC_LINK_MAX, &linkmax); if (!error) error = VOP_PATHCONF(vp, _PC_NAME_MAX, &namemax); if (!error) error = VOP_PATHCONF(vp, _PC_CHOWN_RESTRICTED, &chownres); if (!error) error = VOP_PATHCONF(vp, _PC_NO_TRUNC, ¬runc); getret = VOP_GETATTR(vp, &at, cred, td); vput(vp); vp = NULL; nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3PATHCONF); nfsm_srvpostop_attr(getret, &at); if (error) { error = 0; goto nfsmout; } pc = nfsm_build(struct nfsv3_pathconf *, NFSX_V3PATHCONF); pc->pc_linkmax = txdr_unsigned(linkmax); pc->pc_namemax = txdr_unsigned(namemax); pc->pc_notrunc = txdr_unsigned(notrunc); pc->pc_chownrestricted = txdr_unsigned(chownres); /* * These should probably be supported by VOP_PATHCONF(), but * until msdosfs is exportable (why would you want to?), the * Unix defaults should be ok. */ pc->pc_caseinsensitive = nfsrv_nfs_false; pc->pc_casepreserving = nfsrv_nfs_true; nfsmout: if (vp) vput(vp); VFS_UNLOCK_GIANT(vfslocked); return(error); } /* * Null operation, used by clients to ping server */ /* ARGSUSED */ int nfsrv_null(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep; caddr_t bpos; int error = NFSERR_RETVOID; struct mbuf *mb, *mreq; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); nfsm_reply(0); nfsmout: return (error); } /* * No operation, used for obsolete procedures */ /* ARGSUSED */ int nfsrv_noop(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct thread *td, struct mbuf **mrq) { struct mbuf *mrep = nfsd->nd_mrep; caddr_t bpos; int error; struct mbuf *mb, *mreq; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (nfsd->nd_repstat) error = nfsd->nd_repstat; else error = EPROCUNAVAIL; nfsm_reply(0); error = 0; nfsmout: return (error); } /* * Perform access checking for vnodes obtained from file handles that would * refer to files already opened by a Unix client. You cannot just use * vn_writechk() and VOP_ACCESS() for two reasons. * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write * case. * 2 - The owner is to be given access irrespective of mode bits for some * operations, so that processes that chmod after opening a file don't * break. I don't like this because it opens a security hole, but since * the nfs server opens a security hole the size of a barn door anyhow, * what the heck. * * The exception to rule 2 is EPERM. If a file is IMMUTABLE, VOP_ACCESS() * will return EPERM instead of EACCESS. EPERM is always an error. */ static int nfsrv_access(struct vnode *vp, int flags, struct ucred *cred, int rdonly, struct thread *td, int override) { struct vattr vattr; int error; VFS_ASSERT_GIANT(vp->v_mount); nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (flags & VWRITE) { /* Just vn_writechk() changed to check rdonly */ /* * Disallow write attempts on read-only filesystems; * unless the file is a socket or a block or character * device resident on the filesystem. */ if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * If there's shared text associated with * the inode, we can't allow writing. */ if (vp->v_vflag & VV_TEXT) return (ETXTBSY); } error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); error = VOP_ACCESS(vp, flags, cred, td); /* * Allow certain operations for the owner (reads and writes * on files that are already open). */ if (override && error == EACCES && cred->cr_uid == vattr.va_uid) error = 0; return (error); } Index: head/sys/nfsserver/nfs_srvsubs.c =================================================================== --- head/sys/nfsserver/nfs_srvsubs.c (revision 175201) +++ head/sys/nfsserver/nfs_srvsubs.c (revision 175202) @@ -1,1472 +1,1472 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfsrv_nfs_xdrneg1; u_int32_t nfsrv_rpc_call, nfsrv_rpc_vers, nfsrv_rpc_reply, nfsrv_rpc_msgdenied, nfsrv_rpc_autherr, nfsrv_rpc_mismatch, nfsrv_rpc_auth_unix, nfsrv_rpc_msgaccepted; u_int32_t nfsrv_nfs_prog, nfsrv_nfs_true, nfsrv_nfs_false; /* And other global data */ static const nfstype nfsv2_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFNON, NFCHR, NFNON }; #define vtonfsv2_type(a) txdr_unsigned(nfsv2_type[((int32_t)(a))]) #define vtonfsv3_mode(m) txdr_unsigned((m) & ALLPERMS) int nfsrv_ticks; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; static int nfssvc_offset = SYS_nfssvc; static struct sysent nfssvc_prev_sysent; MAKE_SYSENT(nfssvc); struct mtx nfsd_mtx; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. */ const int nfsrv_nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, }; /* * and the reverse mapping from generic to Version 2 procedure numbers */ const int nfsrvv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; /* * Maps errno values to nfs error numbers. * Use 0 (which gets converted to NFSERR_IO) as the catch all for ones not * specifically defined in RFC 1094. */ static const u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, 0, 0, 0, NFSERR_NXIO, 0, 0, 0, 0, 0, 0, NFSERR_ACCES, 0, 0, 0, NFSERR_EXIST, 0, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, 0, 0, 0, 0, 0, NFSERR_FBIG, NFSERR_NOSPC, 0, NFSERR_ROFS, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NFSERR_NAMETOL, 0, 0, NFSERR_NOTEMPTY, 0, 0, NFSERR_DQUOT, NFSERR_STALE, 0 }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. */ static const short nfsv3err_null[] = { 0, 0, }; static const short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static const short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_fsstat[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; /* * Called once to initialize data structures... */ static int nfsrv_modevent(module_t mod, int type, void *data) { static int registered; int error = 0; switch (type) { case MOD_LOAD: mtx_init(&nfsd_mtx, "nfsd_mtx", NULL, MTX_DEF); nfsrv_rpc_vers = txdr_unsigned(RPC_VER2); nfsrv_rpc_call = txdr_unsigned(RPC_CALL); nfsrv_rpc_reply = txdr_unsigned(RPC_REPLY); nfsrv_rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); nfsrv_rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); nfsrv_rpc_mismatch = txdr_unsigned(RPC_MISMATCH); nfsrv_rpc_autherr = txdr_unsigned(RPC_AUTHERR); nfsrv_rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); nfsrv_nfs_prog = txdr_unsigned(NFS_PROG); nfsrv_nfs_true = txdr_unsigned(TRUE); nfsrv_nfs_false = txdr_unsigned(FALSE); nfsrv_nfs_xdrneg1 = txdr_unsigned(-1); nfsrv_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfsrv_ticks < 1) nfsrv_ticks = 1; nfsrv_initcache(); /* Init the server request cache */ NFSD_LOCK(); nfsrv_init(0); /* Init server data structures */ callout_init(&nfsrv_callout, CALLOUT_MPSAFE); NFSD_UNLOCK(); nfsrv_timer(0); error = syscall_register(&nfssvc_offset, &nfssvc_sysent, &nfssvc_prev_sysent); if (error) break; registered = 1; break; case MOD_UNLOAD: if (nfsrv_numnfsd != 0) { error = EBUSY; break; } if (registered) syscall_deregister(&nfssvc_offset, &nfssvc_prev_sysent); callout_drain(&nfsrv_callout); nfsrv_destroycache(); /* Free the server request cache */ nfsrv_destroycache(); /* Free the server request cache */ mtx_destroy(&nfsd_mtx); break; default: error = EOPNOTSUPP; break; } return error; } static moduledata_t nfsserver_mod = { "nfsserver", nfsrv_modevent, NULL, }; DECLARE_MODULE(nfsserver, nfsserver_mod, SI_SUB_VFS, SI_ORDER_ANY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfsserver, 1); /* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out garbage fields that namei() might leave garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(struct nameidata *ndp, fhandle_t *fhp, int len, struct nfssvc_sock *slp, struct sockaddr *nam, struct mbuf **mdp, caddr_t *dposp, struct vnode **retdirp, int v3, struct vattr *retdirattrp, int *retdirattr_retp, struct thread *td, int pubflag) { int i, rem; struct mbuf *md; char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0; int dvfslocked; int vfslocked; vfslocked = 0; dvfslocked = 0; *retdirp = NULL; cnp->cn_flags |= NOMACCHECK; cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. */ error = nfsrv_fhtovp(fhp, FALSE, &dp, &dvfslocked, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, pubflag); if (error) goto out; vfslocked = VFS_LOCK_GIANT(dp->v_mount); if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. Reference to dp is implicitly transfered * to the returned pointer */ *retdirp = dp; if (v3) { - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); *retdirattr_retp = VOP_GETATTR(dp, retdirattrp, ndp->ni_cnd.cn_cred, td); VOP_UNLOCK(dp, 0, td); } if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = uma_zalloc(namei_zone, M_WAITOK); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; uma_zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style. */ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; uma_zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(rootvnode->v_mount); VFS_UNLOCK_GIANT(vfslocked); dp = rootvnode; vfslocked = tvfslocked; } } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * because lookup() will dereference ni_startdir. */ cnp->cn_thread = td; VREF(dp); ndp->ni_startdir = dp; if (!lockleaf) cnp->cn_flags |= LOCKLEAF; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ if (vfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; error = lookup(ndp); vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0; ndp->ni_cnd.cn_flags &= ~GIANTHELD; if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else uma_zfree(namei_zone, cnp->cn_pnbuf); if (ndp->ni_vp && !lockleaf) VOP_UNLOCK(ndp->ni_vp, 0, td); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, td); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = NULL; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); badlink2: vput(ndp->ni_vp); vrele(ndp->ni_dvp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } if (!lockleaf) cnp->cn_flags &= ~LOCKLEAF; if (cnp->cn_flags & GIANTHELD) { mtx_unlock(&Giant); cnp->cn_flags &= ~GIANTHELD; } /* * nfs_namei() guarentees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { uma_zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } /* * This differs from normal namei() in that even on failure we may * return with Giant held due to the dirp return. Make sure we only * have not recursed however. The calling code only expects to drop * one acquire. */ if (vfslocked || dvfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; if (vfslocked && dvfslocked) VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(struct mbuf *mp, int len, int nul) { struct mbuf *m; int count, i; char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == NULL) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(struct nfsrv_descript *nfsd, int before_ret, struct vattr *before_vap, int after_ret, struct vattr *after_vap, struct mbuf **mbp, char **bposp) { struct mbuf *mb = *mbp; char *bpos = *bposp; u_int32_t *tl; if (before_ret) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = nfsrv_nfs_false; } else { tl = nfsm_build(u_int32_t *, 7 * NFSX_UNSIGNED); *tl++ = nfsrv_nfs_true; txdr_hyper(before_vap->va_size, tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(struct nfsrv_descript *nfsd, int after_ret, struct vattr *after_vap, struct mbuf **mbp, char **bposp) { struct mbuf *mb = *mbp; char *bpos = *bposp; u_int32_t *tl; struct nfs_fattr *fp; if (after_ret) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = nfsrv_nfs_false; } else { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfsrv_nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(struct nfsrv_descript *nfsd, struct vattr *vap, struct nfs_fattr *fp) { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(vap->va_size, &fp->fa3_size); txdr_hyper(vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhandle_t *fhp, int lockflag, struct vnode **vpp, int *vfslockedp, struct ucred *cred, struct nfssvc_sock *slp, struct sockaddr *nam, int *rdonlyp, int pubflag) { struct thread *td = curthread; /* XXX */ struct mount *mp; int i; struct ucred *credanon; int error, exflags; #ifdef MNT_EXNORESPORT /* XXX needs mountd and /etc/exports help yet */ struct sockaddr_int *saddr; #endif int vfslocked; *vfslockedp = 0; *vpp = NULL; if (nfs_ispublicfh(fhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); fhp = &nfs_pub.np_handle; } mp = vfs_getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); error = VFS_CHECKEXP(mp, nam, &exflags, &credanon); if (error) goto out; error = VFS_FHTOVP(mp, &fhp->fh_fid, vpp); if (error) goto out; #ifdef MNT_EXNORESPORT if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) { saddr = (struct sockaddr_in *)nam; if ((saddr->sin_family == AF_INET || saddr->sin_family == AF_INET6) && /* same code for INET and INET6: sin*_port at same offet */ ntohs(saddr->sin_port) >= IPPORT_RESERVED) { vput(*vpp); *vpp = NULL; error = NFSERR_AUTHERR | AUTH_TOOWEAK; } } #endif /* * Check/setup credentials. */ if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; if (!lockflag) VOP_UNLOCK(*vpp, 0, td); out: vfs_rel(mp); if (error) { VFS_UNLOCK_GIANT(vfslocked); } else *vfslockedp = vfslocked; return (error); } /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has * transformed this to all zeroes in both cases, so check for it. */ int nfs_ispublicfh(fhandle_t *fhp) { char *cp = (char *)fhp; int i; NFSD_LOCK_DONTCARE(); for (i = 0; i < NFSX_V3FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(int family, union nethostaddr *haddr, struct sockaddr *nam) { struct sockaddr_in *inetaddr; NFSD_LOCK_DONTCARE(); switch (family) { case AF_INET: inetaddr = (struct sockaddr_in *)nam; if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef INET6 case AF_INET6: { register struct sockaddr_in6 *inet6addr1, *inet6addr2; inet6addr1 = (struct sockaddr_in6 *)nam; inet6addr2 = (struct sockaddr_in6 *)haddr->had_nam; /* XXX - should test sin6_scope_id ? */ if (inet6addr1->sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&inet6addr1->sin6_addr, &inet6addr2->sin6_addr)) return (1); break; } #endif default: break; }; return (0); } /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(struct nfsrv_descript *nd, int err) { const short *defaulterrp, *errp; int e; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } e = 0; if (err <= ELAST) e = nfsrv_v2errmap[err - 1]; if (e != 0) return (e); return (NFSERR_IO); } /* * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) */ void nfsrvw_sort(gid_t *list, int num) { int i, j; gid_t v; /* Insertion sort. */ for (i = 1; i < num; i++) { v = list[i]; /* find correct slot for value v, moving others up */ for (j = i; --j >= 0 && v < list[j];) list[j + 1] = list[j]; list[j + 1] = v; } } /* * Helper functions for macros. */ void nfsm_srvfhtom_xx(fhandle_t *f, int v3, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; if (v3) { tl = nfsm_build_xx(NFSX_UNSIGNED + NFSX_V3FH, mb, bpos); *tl++ = txdr_unsigned(NFSX_V3FH); bcopy(f, tl, NFSX_V3FH); } else { tl = nfsm_build_xx(NFSX_V2FH, mb, bpos); bcopy(f, tl, NFSX_V2FH); } } void nfsm_srvpostop_fh_xx(fhandle_t *f, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; tl = nfsm_build_xx(2 * NFSX_UNSIGNED + NFSX_V3FH, mb, bpos); *tl++ = nfsrv_nfs_true; *tl++ = txdr_unsigned(NFSX_V3FH); bcopy(f, tl, NFSX_V3FH); } int nfsm_srvstrsiz_xx(int *s, int m, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int32_t, *tl); if (*s > m || *s <= 0) return EBADRPC; return 0; } int nfsm_srvnamesiz_xx(int *s, int m, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; NFSD_LOCK_DONTCARE(); tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int32_t, *tl); if (*s > m) return NFSERR_NAMETOL; if (*s <= 0) return EBADRPC; return 0; } int nfsm_srvnamesiz0_xx(int *s, int m, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int32_t, *tl); if (*s > m) return NFSERR_NAMETOL; if (*s < 0) return EBADRPC; return 0; } void nfsm_clget_xx(u_int32_t **tl, struct mbuf *mb, struct mbuf **mp, char **bp, char **be, caddr_t bpos) { struct mbuf *nmp; NFSD_UNLOCK_ASSERT(); if (*bp >= *be) { if (*mp == mb) (*mp)->m_len += *bp - bpos; MGET(nmp, M_TRYWAIT, MT_DATA); MCLGET(nmp, M_TRYWAIT); nmp->m_len = NFSMSIZ(nmp); (*mp)->m_next = nmp; *mp = nmp; *bp = mtod(*mp, caddr_t); *be = *bp + (*mp)->m_len; } *tl = (u_int32_t *)*bp; } int nfsm_srvmtofh_xx(fhandle_t *f, struct nfsrv_descript *nfsd, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; int fhlen; if (nfsd->nd_flag & ND_NFSV3) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; fhlen = fxdr_unsigned(int, *tl); if (fhlen != 0 && fhlen != NFSX_V3FH) return EBADRPC; } else { fhlen = NFSX_V2FH; } if (fhlen != 0) { tl = nfsm_dissect_xx_nonblock(fhlen, md, dpos); if (tl == NULL) return EBADRPC; bcopy((caddr_t)tl, (caddr_t)(f), fhlen); } else { bzero((caddr_t)(f), NFSX_V3FH); } return 0; } int nfsm_srvsattr_xx(struct vattr *a, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; int toclient = 0; tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_mode = nfstov_mode(*tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_uid = fxdr_unsigned(uid_t, *tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_gid = fxdr_unsigned(gid_t, *tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_size = fxdr_hyper(tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: tl = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; fxdr_nfsv3time(tl, &(a)->va_atime); toclient = 1; break; case NFSV3SATTRTIME_TOSERVER: getnanotime(&(a)->va_atime); a->va_vaflags |= VA_UTIMES_NULL; break; } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: tl = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; fxdr_nfsv3time(tl, &(a)->va_mtime); a->va_vaflags &= ~VA_UTIMES_NULL; break; case NFSV3SATTRTIME_TOSERVER: getnanotime(&(a)->va_mtime); if (toclient == 0) a->va_vaflags |= VA_UTIMES_NULL; break; } return 0; } Index: head/sys/security/audit/audit_arg.c =================================================================== --- head/sys/security/audit/audit_arg.c (revision 175201) +++ head/sys/security/audit/audit_arg.c (revision 175202) @@ -1,856 +1,856 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Calls to manipulate elements of the audit record structure from system * call code. Macro wrappers will prevent this functions from being entered * if auditing is disabled, avoiding the function call cost. We check the * thread audit record pointer anyway, as the audit condition could change, * and pre-selection may not have allocated an audit record for this event. * * XXXAUDIT: Should we assert, in each case, that this field of the record * hasn't already been filled in? */ void audit_arg_addr(void *addr) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_addr = addr; ARG_SET_VALID(ar, ARG_ADDR); } void audit_arg_exit(int status, int retval) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_exitstatus = status; ar->k_ar.ar_arg_exitretval = retval; ARG_SET_VALID(ar, ARG_EXIT); } void audit_arg_len(int len) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_len = len; ARG_SET_VALID(ar, ARG_LEN); } void audit_arg_fd(int fd) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_fd = fd; ARG_SET_VALID(ar, ARG_FD); } void audit_arg_fflags(int fflags) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_fflags = fflags; ARG_SET_VALID(ar, ARG_FFLAGS); } void audit_arg_gid(gid_t gid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_gid = gid; ARG_SET_VALID(ar, ARG_GID); } void audit_arg_uid(uid_t uid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_uid = uid; ARG_SET_VALID(ar, ARG_UID); } void audit_arg_egid(gid_t egid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_egid = egid; ARG_SET_VALID(ar, ARG_EGID); } void audit_arg_euid(uid_t euid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_euid = euid; ARG_SET_VALID(ar, ARG_EUID); } void audit_arg_rgid(gid_t rgid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_rgid = rgid; ARG_SET_VALID(ar, ARG_RGID); } void audit_arg_ruid(uid_t ruid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_ruid = ruid; ARG_SET_VALID(ar, ARG_RUID); } void audit_arg_sgid(gid_t sgid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_sgid = sgid; ARG_SET_VALID(ar, ARG_SGID); } void audit_arg_suid(uid_t suid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_suid = suid; ARG_SET_VALID(ar, ARG_SUID); } void audit_arg_groupset(gid_t *gidset, u_int gidset_size) { int i; struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; for (i = 0; i < gidset_size; i++) ar->k_ar.ar_arg_groups.gidset[i] = gidset[i]; ar->k_ar.ar_arg_groups.gidset_size = gidset_size; ARG_SET_VALID(ar, ARG_GROUPSET); } void audit_arg_login(char *login) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; strlcpy(ar->k_ar.ar_arg_login, login, MAXLOGNAME); ARG_SET_VALID(ar, ARG_LOGIN); } void audit_arg_ctlname(int *name, int namelen) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; bcopy(name, &ar->k_ar.ar_arg_ctlname, namelen * sizeof(int)); ar->k_ar.ar_arg_len = namelen; ARG_SET_VALID(ar, ARG_CTLNAME | ARG_LEN); } void audit_arg_mask(int mask) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_mask = mask; ARG_SET_VALID(ar, ARG_MASK); } void audit_arg_mode(mode_t mode) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_mode = mode; ARG_SET_VALID(ar, ARG_MODE); } void audit_arg_dev(int dev) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_dev = dev; ARG_SET_VALID(ar, ARG_DEV); } void audit_arg_value(long value) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_value = value; ARG_SET_VALID(ar, ARG_VALUE); } void audit_arg_owner(uid_t uid, gid_t gid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_uid = uid; ar->k_ar.ar_arg_gid = gid; ARG_SET_VALID(ar, ARG_UID | ARG_GID); } void audit_arg_pid(pid_t pid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_pid = pid; ARG_SET_VALID(ar, ARG_PID); } void audit_arg_process(struct proc *p) { struct kaudit_record *ar; KASSERT(p != NULL, ("audit_arg_process: p == NULL")); PROC_LOCK_ASSERT(p, MA_OWNED); ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = p->p_ucred->cr_audit.ai_auid; ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid; ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0]; ar->k_ar.ar_arg_ruid = p->p_ucred->cr_ruid; ar->k_ar.ar_arg_rgid = p->p_ucred->cr_rgid; ar->k_ar.ar_arg_asid = p->p_ucred->cr_audit.ai_asid; ar->k_ar.ar_arg_termid_addr = p->p_ucred->cr_audit.ai_termid; ar->k_ar.ar_arg_pid = p->p_pid; ARG_SET_VALID(ar, ARG_AUID | ARG_EUID | ARG_EGID | ARG_RUID | ARG_RGID | ARG_ASID | ARG_TERMID_ADDR | ARG_PID | ARG_PROCESS); } void audit_arg_signum(u_int signum) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_signum = signum; ARG_SET_VALID(ar, ARG_SIGNUM); } void audit_arg_socket(int sodomain, int sotype, int soprotocol) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_sockinfo.so_domain = sodomain; ar->k_ar.ar_arg_sockinfo.so_type = sotype; ar->k_ar.ar_arg_sockinfo.so_protocol = soprotocol; ARG_SET_VALID(ar, ARG_SOCKINFO); } void audit_arg_sockaddr(struct thread *td, struct sockaddr *sa) { struct kaudit_record *ar; KASSERT(td != NULL, ("audit_arg_sockaddr: td == NULL")); KASSERT(sa != NULL, ("audit_arg_sockaddr: sa == NULL")); ar = currecord(); if (ar == NULL) return; bcopy(sa, &ar->k_ar.ar_arg_sockaddr, sa->sa_len); switch (sa->sa_family) { case AF_INET: ARG_SET_VALID(ar, ARG_SADDRINET); break; case AF_INET6: ARG_SET_VALID(ar, ARG_SADDRINET6); break; case AF_UNIX: audit_arg_upath(td, ((struct sockaddr_un *)sa)->sun_path, ARG_UPATH1); ARG_SET_VALID(ar, ARG_SADDRUNIX); break; /* XXXAUDIT: default:? */ } } void audit_arg_auid(uid_t auid) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = auid; ARG_SET_VALID(ar, ARG_AUID); } void audit_arg_auditinfo(struct auditinfo *au_info) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = au_info->ai_auid; ar->k_ar.ar_arg_asid = au_info->ai_asid; ar->k_ar.ar_arg_amask.am_success = au_info->ai_mask.am_success; ar->k_ar.ar_arg_amask.am_failure = au_info->ai_mask.am_failure; ar->k_ar.ar_arg_termid.port = au_info->ai_termid.port; ar->k_ar.ar_arg_termid.machine = au_info->ai_termid.machine; ARG_SET_VALID(ar, ARG_AUID | ARG_ASID | ARG_AMASK | ARG_TERMID); } void audit_arg_auditinfo_addr(struct auditinfo_addr *au_info) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_auid = au_info->ai_auid; ar->k_ar.ar_arg_asid = au_info->ai_asid; ar->k_ar.ar_arg_amask.am_success = au_info->ai_mask.am_success; ar->k_ar.ar_arg_amask.am_failure = au_info->ai_mask.am_failure; ar->k_ar.ar_arg_termid_addr.at_type = au_info->ai_termid.at_type; ar->k_ar.ar_arg_termid_addr.at_port = au_info->ai_termid.at_port; ar->k_ar.ar_arg_termid_addr.at_addr[0] = au_info->ai_termid.at_addr[0]; ar->k_ar.ar_arg_termid_addr.at_addr[1] = au_info->ai_termid.at_addr[1]; ar->k_ar.ar_arg_termid_addr.at_addr[2] = au_info->ai_termid.at_addr[2]; ar->k_ar.ar_arg_termid_addr.at_addr[3] = au_info->ai_termid.at_addr[3]; ARG_SET_VALID(ar, ARG_AUID | ARG_ASID | ARG_AMASK | ARG_TERMID_ADDR); } void audit_arg_text(char *text) { struct kaudit_record *ar; KASSERT(text != NULL, ("audit_arg_text: text == NULL")); ar = currecord(); if (ar == NULL) return; /* Invalidate the text string */ ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_TEXT); if (ar->k_ar.ar_arg_text == NULL) ar->k_ar.ar_arg_text = malloc(MAXPATHLEN, M_AUDITTEXT, M_WAITOK); strncpy(ar->k_ar.ar_arg_text, text, MAXPATHLEN); ARG_SET_VALID(ar, ARG_TEXT); } void audit_arg_cmd(int cmd) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_cmd = cmd; ARG_SET_VALID(ar, ARG_CMD); } void audit_arg_svipc_cmd(int cmd) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_svipc_cmd = cmd; ARG_SET_VALID(ar, ARG_SVIPC_CMD); } void audit_arg_svipc_perm(struct ipc_perm *perm) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; bcopy(perm, &ar->k_ar.ar_arg_svipc_perm, sizeof(ar->k_ar.ar_arg_svipc_perm)); ARG_SET_VALID(ar, ARG_SVIPC_PERM); } void audit_arg_svipc_id(int id) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_svipc_id = id; ARG_SET_VALID(ar, ARG_SVIPC_ID); } void audit_arg_svipc_addr(void * addr) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_svipc_addr = addr; ARG_SET_VALID(ar, ARG_SVIPC_ADDR); } void audit_arg_posix_ipc_perm(uid_t uid, gid_t gid, mode_t mode) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_pipc_perm.pipc_uid = uid; ar->k_ar.ar_arg_pipc_perm.pipc_gid = gid; ar->k_ar.ar_arg_pipc_perm.pipc_mode = mode; ARG_SET_VALID(ar, ARG_POSIX_IPC_PERM); } void audit_arg_auditon(union auditon_udata *udata) { struct kaudit_record *ar; ar = currecord(); if (ar == NULL) return; bcopy((void *)udata, &ar->k_ar.ar_arg_auditon, sizeof(ar->k_ar.ar_arg_auditon)); ARG_SET_VALID(ar, ARG_AUDITON); } /* * Audit information about a file, either the file's vnode info, or its * socket address info. */ void audit_arg_file(struct proc *p, struct file *fp) { struct kaudit_record *ar; struct socket *so; struct inpcb *pcb; struct vnode *vp; int vfslocked; ar = currecord(); if (ar == NULL) return; switch (fp->f_type) { case DTYPE_VNODE: case DTYPE_FIFO: /* * XXXAUDIT: Only possibly to record as first vnode? */ vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); audit_arg_vnode(vp, ARG_VNODE1); VOP_UNLOCK(vp, 0, curthread); VFS_UNLOCK_GIANT(vfslocked); break; case DTYPE_SOCKET: so = (struct socket *)fp->f_data; if (INP_CHECK_SOCKAF(so, PF_INET)) { SOCK_LOCK(so); ar->k_ar.ar_arg_sockinfo.so_type = so->so_type; ar->k_ar.ar_arg_sockinfo.so_domain = INP_SOCKAF(so); ar->k_ar.ar_arg_sockinfo.so_protocol = so->so_proto->pr_protocol; SOCK_UNLOCK(so); pcb = (struct inpcb *)so->so_pcb; INP_LOCK(pcb); ar->k_ar.ar_arg_sockinfo.so_raddr = pcb->inp_faddr.s_addr; ar->k_ar.ar_arg_sockinfo.so_laddr = pcb->inp_laddr.s_addr; ar->k_ar.ar_arg_sockinfo.so_rport = pcb->inp_fport; ar->k_ar.ar_arg_sockinfo.so_lport = pcb->inp_lport; INP_UNLOCK(pcb); ARG_SET_VALID(ar, ARG_SOCKINFO); } break; default: /* XXXAUDIT: else? */ break; } } /* * Store a path as given by the user process for auditing into the audit * record stored on the user thread. This function will allocate the memory * to store the path info if not already available. This memory will be freed * when the audit record is freed. * * XXXAUDIT: Possibly assert that the memory isn't already allocated? */ void audit_arg_upath(struct thread *td, char *upath, u_int64_t flag) { struct kaudit_record *ar; char **pathp; KASSERT(td != NULL, ("audit_arg_upath: td == NULL")); KASSERT(upath != NULL, ("audit_arg_upath: upath == NULL")); ar = currecord(); if (ar == NULL) return; KASSERT((flag == ARG_UPATH1) || (flag == ARG_UPATH2), ("audit_arg_upath: flag %llu", (unsigned long long)flag)); KASSERT((flag != ARG_UPATH1) || (flag != ARG_UPATH2), ("audit_arg_upath: flag %llu", (unsigned long long)flag)); if (flag == ARG_UPATH1) pathp = &ar->k_ar.ar_arg_upath1; else pathp = &ar->k_ar.ar_arg_upath2; if (*pathp == NULL) *pathp = malloc(MAXPATHLEN, M_AUDITPATH, M_WAITOK); canon_path(td, upath, *pathp); ARG_SET_VALID(ar, flag); } /* * Function to save the path and vnode attr information into the audit * record. * * It is assumed that the caller will hold any vnode locks necessary to * perform a VOP_GETATTR() on the passed vnode. * * XXX: The attr code is very similar to vfs_vnops.c:vn_stat(), but always * provides access to the generation number as we need that to construct the * BSM file ID. * * XXX: We should accept the process argument from the caller, since it's * very likely they already have a reference. * * XXX: Error handling in this function is poor. * * XXXAUDIT: Possibly KASSERT the path pointer is NULL? */ void audit_arg_vnode(struct vnode *vp, u_int64_t flags) { struct kaudit_record *ar; struct vattr vattr; int error; struct vnode_au_info *vnp; KASSERT(vp != NULL, ("audit_arg_vnode: vp == NULL")); KASSERT((flags == ARG_VNODE1) || (flags == ARG_VNODE2), ("audit_arg_vnode: flags %jd", (intmax_t)flags)); /* * Assume that if the caller is calling audit_arg_vnode() on a * non-MPSAFE vnode, then it will have acquired Giant. */ VFS_ASSERT_GIANT(vp->v_mount); ASSERT_VOP_LOCKED(vp, "audit_arg_vnode"); ar = currecord(); if (ar == NULL) return; /* * XXXAUDIT: The below clears, and then resets the flags for valid * arguments. Ideally, either the new vnode is used, or the old one * would be. */ if (flags & ARG_VNODE1) { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE1); vnp = &ar->k_ar.ar_arg_vnode1; } else { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE2); vnp = &ar->k_ar.ar_arg_vnode2; } error = VOP_GETATTR(vp, &vattr, curthread->td_ucred, curthread); if (error) { /* XXX: How to handle this case? */ return; } vnp->vn_mode = vattr.va_mode; vnp->vn_uid = vattr.va_uid; vnp->vn_gid = vattr.va_gid; vnp->vn_dev = vattr.va_rdev; vnp->vn_fsid = vattr.va_fsid; vnp->vn_fileid = vattr.va_fileid; vnp->vn_gen = vattr.va_gen; if (flags & ARG_VNODE1) ARG_SET_VALID(ar, ARG_VNODE1); else ARG_SET_VALID(ar, ARG_VNODE2); } /* * Audit the argument strings passed to exec. */ void audit_arg_argv(char *argv, int argc, int length) { struct kaudit_record *ar; if (audit_argv == 0) return; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_argv = malloc(length, M_AUDITTEXT, M_WAITOK); bcopy(argv, ar->k_ar.ar_arg_argv, length); ar->k_ar.ar_arg_argc = argc; ARG_SET_VALID(ar, ARG_ARGV); } /* * Audit the environment strings passed to exec. */ void audit_arg_envv(char *envv, int envc, int length) { struct kaudit_record *ar; if (audit_arge == 0) return; ar = currecord(); if (ar == NULL) return; ar->k_ar.ar_arg_envv = malloc(length, M_AUDITTEXT, M_WAITOK); bcopy(envv, ar->k_ar.ar_arg_envv, length); ar->k_ar.ar_arg_envc = envc; ARG_SET_VALID(ar, ARG_ENVV); } /* * The close() system call uses it's own audit call to capture the path/vnode * information because those pieces are not easily obtained within the system * call itself. */ void audit_sysclose(struct thread *td, int fd) { struct kaudit_record *ar; struct vnode *vp; struct file *fp; int vfslocked; KASSERT(td != NULL, ("audit_sysclose: td == NULL")); ar = currecord(); if (ar == NULL) return; audit_arg_fd(fd); if (getvnode(td->td_proc->p_fd, fd, &fp) != 0) return; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); audit_arg_vnode(vp, ARG_VNODE1); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); } Index: head/sys/security/audit/audit_bsm_klib.c =================================================================== --- head/sys/security/audit/audit_bsm_klib.c (revision 175201) +++ head/sys/security/audit/audit_bsm_klib.c (revision 175202) @@ -1,547 +1,547 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Hash table functions for the audit event number to event class mask * mapping. */ #define EVCLASSMAP_HASH_TABLE_SIZE 251 struct evclass_elem { au_event_t event; au_class_t class; LIST_ENTRY(evclass_elem) entry; }; struct evclass_list { LIST_HEAD(, evclass_elem) head; }; static MALLOC_DEFINE(M_AUDITEVCLASS, "audit_evclass", "Audit event class"); static struct mtx evclass_mtx; static struct evclass_list evclass_hash[EVCLASSMAP_HASH_TABLE_SIZE]; /* * Look up the class for an audit event in the class mapping table. */ au_class_t au_event_class(au_event_t event) { struct evclass_list *evcl; struct evclass_elem *evc; au_class_t class; mtx_lock(&evclass_mtx); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; class = 0; LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { class = evc->class; goto out; } } out: mtx_unlock(&evclass_mtx); return (class); } /* * Insert a event to class mapping. If the event already exists in the * mapping, then replace the mapping with the new one. * * XXX There is currently no constraints placed on the number of mappings. * May want to either limit to a number, or in terms of memory usage. */ void au_evclassmap_insert(au_event_t event, au_class_t class) { struct evclass_list *evcl; struct evclass_elem *evc, *evc_new; /* * Pessimistically, always allocate storage before acquiring mutex. * Free if there is already a mapping for this event. */ evc_new = malloc(sizeof(*evc), M_AUDITEVCLASS, M_WAITOK); mtx_lock(&evclass_mtx); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { evc->class = class; mtx_unlock(&evclass_mtx); free(evc_new, M_AUDITEVCLASS); return; } } evc = evc_new; evc->event = event; evc->class = class; LIST_INSERT_HEAD(&evcl->head, evc, entry); mtx_unlock(&evclass_mtx); } void au_evclassmap_init(void) { int i; mtx_init(&evclass_mtx, "evclass_mtx", NULL, MTX_DEF); for (i = 0; i < EVCLASSMAP_HASH_TABLE_SIZE; i++) LIST_INIT(&evclass_hash[i].head); /* * Set up the initial event to class mapping for system calls. * * XXXRW: Really, this should walk all possible audit events, not all * native ABI system calls, as there may be audit events reachable * only through non-native system calls. It also seems a shame to * frob the mutex this early. */ for (i = 0; i < SYS_MAXSYSCALL; i++) { if (sysent[i].sy_auevent != AUE_NULL) au_evclassmap_insert(sysent[i].sy_auevent, 0); } } /* * Check whether an event is aditable by comparing the mask of classes this * event is part of against the given mask. */ int au_preselect(au_event_t event, au_class_t class, au_mask_t *mask_p, int sorf) { au_class_t effmask = 0; if (mask_p == NULL) return (-1); /* * Perform the actual check of the masks against the event. */ if (sorf & AU_PRS_SUCCESS) effmask |= (mask_p->am_success & class); if (sorf & AU_PRS_FAILURE) effmask |= (mask_p->am_failure & class); if (effmask) return (1); else return (0); } /* * Convert sysctl names and present arguments to events. */ au_event_t ctlname_to_sysctlevent(int name[], uint64_t valid_arg) { /* can't parse it - so return the worst case */ if ((valid_arg & (ARG_CTLNAME | ARG_LEN)) != (ARG_CTLNAME | ARG_LEN)) return (AUE_SYSCTL); switch (name[0]) { /* non-admin "lookups" treat them special */ case KERN_OSTYPE: case KERN_OSRELEASE: case KERN_OSREV: case KERN_VERSION: case KERN_ARGMAX: case KERN_CLOCKRATE: case KERN_BOOTTIME: case KERN_POSIX1: case KERN_NGROUPS: case KERN_JOB_CONTROL: case KERN_SAVED_IDS: case KERN_OSRELDATE: case KERN_DUMMY: return (AUE_SYSCTL_NONADMIN); /* only treat the changeable controls as admin */ case KERN_MAXVNODES: case KERN_MAXPROC: case KERN_MAXFILES: case KERN_MAXPROCPERUID: case KERN_MAXFILESPERPROC: case KERN_HOSTID: case KERN_SECURELVL: case KERN_HOSTNAME: case KERN_VNODE: case KERN_PROC: case KERN_FILE: case KERN_PROF: case KERN_NISDOMAINNAME: case KERN_UPDATEINTERVAL: case KERN_NTP_PLL: case KERN_BOOTFILE: case KERN_DUMPDEV: case KERN_IPC: case KERN_PS_STRINGS: case KERN_USRSTACK: case KERN_LOGSIGEXIT: case KERN_IOV_MAX: case KERN_MAXID: return ((valid_arg & ARG_VALUE) ? AUE_SYSCTL : AUE_SYSCTL_NONADMIN); default: return (AUE_SYSCTL); } /* NOTREACHED */ } /* * Convert an open flags specifier into a specific type of open event for * auditing purposes. */ au_event_t flags_and_error_to_openevent(int oflags, int error) { au_event_t aevent; /* * Need to check only those flags we care about. */ oflags = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY); /* * These checks determine what flags are on with the condition that * ONLY that combination is on, and no other flags are on. */ switch (oflags) { case O_RDONLY: aevent = AUE_OPEN_R; break; case (O_RDONLY | O_CREAT): aevent = AUE_OPEN_RC; break; case (O_RDONLY | O_CREAT | O_TRUNC): aevent = AUE_OPEN_RTC; break; case (O_RDONLY | O_TRUNC): aevent = AUE_OPEN_RT; break; case O_RDWR: aevent = AUE_OPEN_RW; break; case (O_RDWR | O_CREAT): aevent = AUE_OPEN_RWC; break; case (O_RDWR | O_CREAT | O_TRUNC): aevent = AUE_OPEN_RWTC; break; case (O_RDWR | O_TRUNC): aevent = AUE_OPEN_RWT; break; case O_WRONLY: aevent = AUE_OPEN_W; break; case (O_WRONLY | O_CREAT): aevent = AUE_OPEN_WC; break; case (O_WRONLY | O_CREAT | O_TRUNC): aevent = AUE_OPEN_WTC; break; case (O_WRONLY | O_TRUNC): aevent = AUE_OPEN_WT; break; default: aevent = AUE_OPEN; break; } #if 0 /* * Convert chatty errors to better matching events. Failures to * find a file are really just attribute events -- so recast them as * such. * * XXXAUDIT: Solaris defines that AUE_OPEN will never be returned, it * is just a placeholder. However, in Darwin we return that in * preference to other events. For now, comment this out as we don't * have a BSM conversion routine for AUE_OPEN. */ switch (aevent) { case AUE_OPEN_R: case AUE_OPEN_RT: case AUE_OPEN_RW: case AUE_OPEN_RWT: case AUE_OPEN_W: case AUE_OPEN_WT: if (error == ENOENT) aevent = AUE_OPEN; } #endif return (aevent); } /* * Convert a MSGCTL command to a specific event. */ int msgctl_to_event(int cmd) { switch (cmd) { case IPC_RMID: return (AUE_MSGCTL_RMID); case IPC_SET: return (AUE_MSGCTL_SET); case IPC_STAT: return (AUE_MSGCTL_STAT); default: /* We will audit a bad command. */ return (AUE_MSGCTL); } } /* * Convert a SEMCTL command to a specific event. */ int semctl_to_event(int cmd) { switch (cmd) { case GETALL: return (AUE_SEMCTL_GETALL); case GETNCNT: return (AUE_SEMCTL_GETNCNT); case GETPID: return (AUE_SEMCTL_GETPID); case GETVAL: return (AUE_SEMCTL_GETVAL); case GETZCNT: return (AUE_SEMCTL_GETZCNT); case IPC_RMID: return (AUE_SEMCTL_RMID); case IPC_SET: return (AUE_SEMCTL_SET); case SETALL: return (AUE_SEMCTL_SETALL); case SETVAL: return (AUE_SEMCTL_SETVAL); case IPC_STAT: return (AUE_SEMCTL_STAT); default: /* We will audit a bad command */ return (AUE_SEMCTL); } } /* * Convert a command for the auditon() system call to a audit event. */ int auditon_command_event(int cmd) { switch(cmd) { case A_GETPOLICY: return (AUE_AUDITON_GPOLICY); case A_SETPOLICY: return (AUE_AUDITON_SPOLICY); case A_GETKMASK: return (AUE_AUDITON_GETKMASK); case A_SETKMASK: return (AUE_AUDITON_SETKMASK); case A_GETQCTRL: return (AUE_AUDITON_GQCTRL); case A_SETQCTRL: return (AUE_AUDITON_SQCTRL); case A_GETCWD: return (AUE_AUDITON_GETCWD); case A_GETCAR: return (AUE_AUDITON_GETCAR); case A_GETSTAT: return (AUE_AUDITON_GETSTAT); case A_SETSTAT: return (AUE_AUDITON_SETSTAT); case A_SETUMASK: return (AUE_AUDITON_SETUMASK); case A_SETSMASK: return (AUE_AUDITON_SETSMASK); case A_GETCOND: return (AUE_AUDITON_GETCOND); case A_SETCOND: return (AUE_AUDITON_SETCOND); case A_GETCLASS: return (AUE_AUDITON_GETCLASS); case A_SETCLASS: return (AUE_AUDITON_SETCLASS); case A_GETPINFO: case A_SETPMASK: case A_SETFSIZE: case A_GETFSIZE: case A_GETPINFO_ADDR: case A_GETKAUDIT: case A_SETKAUDIT: default: return (AUE_AUDITON); /* No special record */ } } /* * Create a canonical path from given path by prefixing either the root * directory, or the current working directory. If the process working * directory is NULL, we could use 'rootvnode' to obtain the root directory, * but this results in a volfs name written to the audit log. So we will * leave the filename starting with '/' in the audit log in this case. * * XXXRW: Since we combine two paths here, ideally a buffer of size * MAXPATHLEN * 2 would be passed in. */ void canon_path(struct thread *td, char *path, char *cpath) { char *bufp; char *retbuf, *freebuf; struct vnode *vnp; struct filedesc *fdp; int cisr, error, vfslocked; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "canon_path() at %s:%d", __FILE__, __LINE__); fdp = td->td_proc->p_fd; bufp = path; cisr = 0; FILEDESC_SLOCK(fdp); if (*(path) == '/') { while (*(bufp) == '/') bufp++; /* Skip leading '/'s. */ /* * If no process root, or it is the same as the system root, * audit the path as passed in with a single '/'. */ if ((fdp->fd_rdir == NULL) || (fdp->fd_rdir == rootvnode)) { vnp = NULL; bufp--; /* Restore one '/'. */ } else { vnp = fdp->fd_rdir; /* Use process root. */ vref(vnp); } } else { vnp = fdp->fd_cdir; /* Prepend the current dir. */ cisr = (fdp->fd_rdir == fdp->fd_cdir); vref(vnp); bufp = path; } FILEDESC_SUNLOCK(fdp); if (vnp != NULL) { /* * XXX: vn_fullpath() on FreeBSD is "less reliable" than * vn_getpath() on Darwin, so this will need more attention * in the future. Also, the question and string bounding * here seems a bit questionable and will also require * attention. */ vfslocked = VFS_LOCK_GIANT(vnp->v_mount); - vn_lock(vnp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vnp, LK_EXCLUSIVE | LK_RETRY); error = vn_fullpath(td, vnp, &retbuf, &freebuf); if (error == 0) { /* Copy and free buffer allocated by vn_fullpath(). * If the current working directory was the same as * the root directory, and the path was a relative * pathname, do not separate the two components with * the '/' character. */ snprintf(cpath, MAXPATHLEN, "%s%s%s", retbuf, cisr ? "" : "/", bufp); free(freebuf, M_TEMP); } else cpath[0] = '\0'; vput(vnp); VFS_UNLOCK_GIANT(vfslocked); } else strlcpy(cpath, bufp, MAXPATHLEN); } Index: head/sys/security/audit/audit_worker.c =================================================================== --- head/sys/security/audit/audit_worker.c (revision 175201) +++ head/sys/security/audit/audit_worker.c (revision 175202) @@ -1,551 +1,551 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. * Copyright (c) 2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Worker thread that will schedule disk I/O, etc. */ static struct proc *audit_thread; /* * When an audit log is rotated, the actual rotation must be performed by the * audit worker thread, as it may have outstanding writes on the current * audit log. audit_replacement_vp holds the vnode replacing the current * vnode. We can't let more than one replacement occur at a time, so if more * than one thread requests a replacement, only one can have the replacement * "in progress" at any given moment. If a thread tries to replace the audit * vnode and discovers a replacement is already in progress (i.e., * audit_replacement_flag != 0), then it will sleep on audit_replacement_cv * waiting its turn to perform a replacement. When a replacement is * completed, this cv is signalled by the worker thread so a waiting thread * can start another replacement. We also store a credential to perform * audit log write operations with. * * The current credential and vnode are thread-local to audit_worker. */ static struct cv audit_replacement_cv; static int audit_replacement_flag; static struct vnode *audit_replacement_vp; static struct ucred *audit_replacement_cred; /* * Flags related to Kernel->user-space communication. */ static int audit_file_rotate_wait; /* * Write an audit record to a file, performed as the last stage after both * preselection and BSM conversion. Both space management and write failures * are handled in this function. * * No attempt is made to deal with possible failure to deliver a trigger to * the audit daemon, since the message is asynchronous anyway. */ static void audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td, void *data, size_t len) { static struct timeval last_lowspace_trigger; static struct timeval last_fail; static int cur_lowspace_trigger; struct statfs *mnt_stat; int error, vfslocked; static int cur_fail; struct vattr vattr; long temp; if (vp == NULL) return; mnt_stat = &vp->v_mount->mnt_stat; vfslocked = VFS_LOCK_GIANT(vp->v_mount); /* * First, gather statistics on the audit log file and file system so * that we know how we're doing on space. Consider failure of these * operations to indicate a future inability to write to the file. */ error = VFS_STATFS(vp->v_mount, mnt_stat, td); if (error) goto fail; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); if (error) goto fail; audit_fstat.af_currsz = vattr.va_size; /* * We handle four different space-related limits: * * - A fixed (hard) limit on the minimum free blocks we require on * the file system, and results in record loss, a trigger, and * possible fail stop due to violating invariants. * * - An administrative (soft) limit, which when fallen below, results * in the kernel notifying the audit daemon of low space. * * - An audit trail size limit, which when gone above, results in the * kernel notifying the audit daemon that rotation is desired. * * - The total depth of the kernel audit record exceeding free space, * which can lead to possible fail stop (with drain), in order to * prevent violating invariants. Failure here doesn't halt * immediately, but prevents new records from being generated. * * Possibly, the last of these should be handled differently, always * allowing a full queue to be lost, rather than trying to prevent * loss. * * First, handle the hard limit, which generates a trigger and may * fail stop. This is handled in the same manner as ENOSPC from * VOP_WRITE, and results in record loss. */ if (mnt_stat->f_bfree < AUDIT_HARD_LIMIT_FREE_BLOCKS) { error = ENOSPC; goto fail_enospc; } /* * Second, handle falling below the soft limit, if defined; we send * the daemon a trigger and continue processing the record. Triggers * are limited to 1/sec. */ if (audit_qctrl.aq_minfree != 0) { /* * XXXAUDIT: Check math and block size calculations here. */ temp = mnt_stat->f_blocks / (100 / audit_qctrl.aq_minfree); if (mnt_stat->f_bfree < temp) { if (ppsratecheck(&last_lowspace_trigger, &cur_lowspace_trigger, 1)) { (void)send_trigger(AUDIT_TRIGGER_LOW_SPACE); printf("Warning: audit space low\n"); } } } /* * If the current file is getting full, generate a rotation trigger * to the daemon. This is only approximate, which is fine as more * records may be generated before the daemon rotates the file. */ if ((audit_fstat.af_filesz != 0) && (audit_file_rotate_wait == 0) && (vattr.va_size >= audit_fstat.af_filesz)) { audit_file_rotate_wait = 1; (void)send_trigger(AUDIT_TRIGGER_ROTATE_KERNEL); } /* * If the estimated amount of audit data in the audit event queue * (plus records allocated but not yet queued) has reached the amount * of free space on the disk, then we need to go into an audit fail * stop state, in which we do not permit the allocation/committing of * any new audit records. We continue to process records but don't * allow any activities that might generate new records. In the * future, we might want to detect when space is available again and * allow operation to continue, but this behavior is sufficient to * meet fail stop requirements in CAPP. */ if (audit_fail_stop) { if ((unsigned long)((audit_q_len + audit_pre_q_len + 1) * MAX_AUDIT_RECORD_SIZE) / mnt_stat->f_bsize >= (unsigned long)(mnt_stat->f_bfree)) { if (ppsratecheck(&last_fail, &cur_fail, 1)) printf("audit_record_write: free space " "below size of audit queue, failing " "stop\n"); audit_in_failure = 1; } else if (audit_in_failure) { /* * Note: if we want to handle recovery, this is the * spot to do it: unset audit_in_failure, and issue a * wakeup on the cv. */ } } error = vn_rdwr(UIO_WRITE, vp, data, len, (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td); if (error == ENOSPC) goto fail_enospc; else if (error) goto fail; /* * Catch completion of a queue drain here; if we're draining and the * queue is now empty, fail stop. That audit_fail_stop is implicitly * true, since audit_in_failure can only be set of audit_fail_stop is * set. * * Note: if we handle recovery from audit_in_failure, then we need to * make panic here conditional. */ if (audit_in_failure) { if (audit_q_len == 0 && audit_pre_q_len == 0) { VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); (void)VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); panic("Audit store overflow; record queue drained."); } } VFS_UNLOCK_GIANT(vfslocked); return; fail_enospc: /* * ENOSPC is considered a special case with respect to failures, as * this can reflect either our preemptive detection of insufficient * space, or ENOSPC returned by the vnode write call. */ if (audit_fail_stop) { VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); (void)VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); panic("Audit log space exhausted and fail-stop set."); } (void)send_trigger(AUDIT_TRIGGER_NO_SPACE); audit_suspended = 1; /* FALLTHROUGH */ fail: /* * We have failed to write to the file, so the current record is * lost, which may require an immediate system halt. */ if (audit_panic_on_write_fail) { VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); (void)VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); panic("audit_worker: write error %d\n", error); } else if (ppsratecheck(&last_fail, &cur_fail, 1)) printf("audit_worker: write error %d\n", error); VFS_UNLOCK_GIANT(vfslocked); } /* * If an appropriate signal has been received rotate the audit log based on * the global replacement variables. Signal consumers as needed that the * rotation has taken place. * * The global variables and CVs used to signal the audit_worker to perform a * rotation are essentially a message queue of depth 1. It would be much * nicer to actually use a message queue. */ static void audit_worker_rotate(struct ucred **audit_credp, struct vnode **audit_vpp, struct thread *audit_td) { int do_replacement_signal, vfslocked; struct ucred *old_cred; struct vnode *old_vp; mtx_assert(&audit_mtx, MA_OWNED); do_replacement_signal = 0; while (audit_replacement_flag != 0) { old_cred = *audit_credp; old_vp = *audit_vpp; *audit_credp = audit_replacement_cred; *audit_vpp = audit_replacement_vp; audit_replacement_cred = NULL; audit_replacement_vp = NULL; audit_replacement_flag = 0; audit_enabled = (*audit_vpp != NULL); if (old_vp != NULL) { mtx_unlock(&audit_mtx); vfslocked = VFS_LOCK_GIANT(old_vp->v_mount); vn_close(old_vp, AUDIT_CLOSE_FLAGS, old_cred, audit_td); VFS_UNLOCK_GIANT(vfslocked); crfree(old_cred); mtx_lock(&audit_mtx); old_cred = NULL; old_vp = NULL; } do_replacement_signal = 1; } /* * Signal that replacement have occurred to wake up and start any * other replacements started in parallel. We can continue about our * business in the mean time. We broadcast so that both new * replacements can be inserted, but also so that the source(s) of * replacement can return successfully. */ if (do_replacement_signal) cv_broadcast(&audit_replacement_cv); } /* * Given a kernel audit record, process as required. Kernel audit records * are converted to one, or possibly two, BSM records, depending on whether * there is a user audit record present also. Kernel records need be * converted to BSM before they can be written out. Both types will be * written to disk, and audit pipes. */ static void audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred, struct thread *audit_td, struct kaudit_record *ar) { struct au_record *bsm; au_class_t class; au_event_t event; au_id_t auid; int error, sorf; /* * First, handle the user record, if any: commit to the system trail * and audit pipes as selected. */ if ((ar->k_ar_commit & AR_COMMIT_USER) && (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) audit_record_write(audit_vp, audit_cred, audit_td, ar->k_udata, ar->k_ulen); if ((ar->k_ar_commit & AR_COMMIT_USER) && (ar->k_ar_commit & AR_PRESELECT_USER_PIPE)) audit_pipe_submit_user(ar->k_udata, ar->k_ulen); if (!(ar->k_ar_commit & AR_COMMIT_KERNEL) || ((ar->k_ar_commit & AR_PRESELECT_PIPE) == 0 && (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0)) return; auid = ar->k_ar.ar_subj_auid; event = ar->k_ar.ar_event; class = au_event_class(event); if (ar->k_ar.ar_errno == 0) sorf = AU_PRS_SUCCESS; else sorf = AU_PRS_FAILURE; error = kaudit_to_bsm(ar, &bsm); switch (error) { case BSM_NOAUDIT: return; case BSM_FAILURE: printf("audit_worker_process_record: BSM_FAILURE\n"); return; case BSM_SUCCESS: break; default: panic("kaudit_to_bsm returned %d", error); } if (ar->k_ar_commit & AR_PRESELECT_TRAIL) audit_record_write(audit_vp, audit_cred, audit_td, bsm->data, bsm->len); if (ar->k_ar_commit & AR_PRESELECT_PIPE) audit_pipe_submit(auid, event, class, sorf, ar->k_ar_commit & AR_PRESELECT_TRAIL, bsm->data, bsm->len); kau_free(bsm); } /* * The audit_worker thread is responsible for watching the event queue, * dequeueing records, converting them to BSM format, and committing them to * disk. In order to minimize lock thrashing, records are dequeued in sets * to a thread-local work queue. In addition, the audit_work performs the * actual exchange of audit log vnode pointer, as audit_vp is a thread-local * variable. */ static void audit_worker(void *arg) { struct kaudit_queue ar_worklist; struct kaudit_record *ar; struct ucred *audit_cred; struct thread *audit_td; struct vnode *audit_vp; int lowater_signal; /* * These are thread-local variables requiring no synchronization. */ TAILQ_INIT(&ar_worklist); audit_cred = NULL; audit_td = curthread; audit_vp = NULL; mtx_lock(&audit_mtx); while (1) { mtx_assert(&audit_mtx, MA_OWNED); /* * Wait for record or rotation events. */ while (!audit_replacement_flag && TAILQ_EMPTY(&audit_q)) cv_wait(&audit_worker_cv, &audit_mtx); /* * First priority: replace the audit log target if requested. */ audit_worker_rotate(&audit_cred, &audit_vp, audit_td); /* * If there are records in the global audit record queue, * transfer them to a thread-local queue and process them * one by one. If we cross the low watermark threshold, * signal any waiting processes that they may wake up and * continue generating records. */ lowater_signal = 0; while ((ar = TAILQ_FIRST(&audit_q))) { TAILQ_REMOVE(&audit_q, ar, k_q); audit_q_len--; if (audit_q_len == audit_qctrl.aq_lowater) lowater_signal++; TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q); } if (lowater_signal) cv_broadcast(&audit_watermark_cv); mtx_unlock(&audit_mtx); while ((ar = TAILQ_FIRST(&ar_worklist))) { TAILQ_REMOVE(&ar_worklist, ar, k_q); audit_worker_process_record(audit_vp, audit_cred, audit_td, ar); audit_free(ar); } mtx_lock(&audit_mtx); } } /* * audit_rotate_vnode() is called by a user or kernel thread to configure or * de-configure auditing on a vnode. The arguments are the replacement * credential and vnode to substitute for the current credential and vnode, * if any. If either is set to NULL, both should be NULL, and this is used * to indicate that audit is being disabled. The real work is done in the * audit_worker thread, but audit_rotate_vnode() waits synchronously for that * to complete. * * The vnode should be referenced and opened by the caller. The credential * should be referenced. audit_rotate_vnode() will own both references as of * this call, so the caller should not release either. * * XXXAUDIT: Review synchronize communication logic. Really, this is a * message queue of depth 1. We are essentially acquiring ownership of the * communications queue, inserting our message, and waiting for an * acknowledgement. */ void audit_rotate_vnode(struct ucred *cred, struct vnode *vp) { /* * If other parallel log replacements have been requested, we wait * until they've finished before continuing. */ mtx_lock(&audit_mtx); while (audit_replacement_flag != 0) cv_wait(&audit_replacement_cv, &audit_mtx); audit_replacement_cred = cred; audit_replacement_flag = 1; audit_replacement_vp = vp; /* * Wake up the audit worker to perform the exchange once we release * the mutex. */ cv_signal(&audit_worker_cv); /* * Wait for the audit_worker to broadcast that a replacement has * taken place; we know that once this has happened, our vnode has * been replaced in, so we can return successfully. */ cv_wait(&audit_replacement_cv, &audit_mtx); audit_file_rotate_wait = 0; /* We can now request another rotation */ mtx_unlock(&audit_mtx); } void audit_worker_init(void) { int error; cv_init(&audit_replacement_cv, "audit_replacement_cv"); error = kproc_create(audit_worker, NULL, &audit_thread, RFHIGHPID, 0, "audit"); if (error) panic("audit_worker_init: kproc_create returned %d", error); } Index: head/sys/security/mac/mac_process.c =================================================================== --- head/sys/security/mac/mac_process.c (revision 175201) +++ head/sys/security/mac/mac_process.c (revision 175202) @@ -1,633 +1,633 @@ /*- * Copyright (c) 1999-2002 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2003 Networks Associates Technology, Inc. * Copyright (c) 2005 Samy Al Bahra * Copyright (c) 2006 SPARTA, Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int mac_mmap_revocation = 1; SYSCTL_INT(_security_mac, OID_AUTO, mmap_revocation, CTLFLAG_RW, &mac_mmap_revocation, 0, "Revoke mmap access to files on subject " "relabel"); static int mac_mmap_revocation_via_cow = 0; SYSCTL_INT(_security_mac, OID_AUTO, mmap_revocation_via_cow, CTLFLAG_RW, &mac_mmap_revocation_via_cow, 0, "Revoke mmap access to files via " "copy-on-write semantics, or by removing all write access"); static void mac_cred_mmapped_drop_perms_recurse(struct thread *td, struct ucred *cred, struct vm_map *map); struct label * mac_cred_label_alloc(void) { struct label *label; label = mac_labelzone_alloc(M_WAITOK); MAC_PERFORM(cred_init_label, label); return (label); } void mac_cred_init(struct ucred *cred) { cred->cr_label = mac_cred_label_alloc(); } static struct label * mac_proc_label_alloc(void) { struct label *label; label = mac_labelzone_alloc(M_WAITOK); MAC_PERFORM(proc_init_label, label); return (label); } void mac_proc_init(struct proc *p) { p->p_label = mac_proc_label_alloc(); } void mac_cred_label_free(struct label *label) { MAC_PERFORM(cred_destroy_label, label); mac_labelzone_free(label); } void mac_cred_destroy(struct ucred *cred) { mac_cred_label_free(cred->cr_label); cred->cr_label = NULL; } static void mac_proc_label_free(struct label *label) { MAC_PERFORM(proc_destroy_label, label); mac_labelzone_free(label); } void mac_proc_destroy(struct proc *p) { mac_proc_label_free(p->p_label); p->p_label = NULL; } int mac_cred_externalize_label(struct label *label, char *elements, char *outbuf, size_t outbuflen) { int error; MAC_EXTERNALIZE(cred, label, elements, outbuf, outbuflen); return (error); } int mac_cred_internalize_label(struct label *label, char *string) { int error; MAC_INTERNALIZE(cred, label, string); return (error); } /* * Initialize MAC label for the first kernel process, from which other kernel * processes and threads are spawned. */ void mac_proc_create_swapper(struct ucred *cred) { MAC_PERFORM(proc_create_swapper, cred); } /* * Initialize MAC label for the first userland process, from which other * userland processes and threads are spawned. */ void mac_proc_create_init(struct ucred *cred) { MAC_PERFORM(proc_create_init, cred); } /* * When a thread becomes an NFS server daemon, its credential may need to be * updated to reflect this so that policies can recognize when file system * operations originate from the network. * * At some point, it would be desirable if the credential used for each NFS * RPC could be set based on the RPC context (i.e., source system, etc) to * provide more fine-grained access control. */ void mac_proc_associate_nfsd(struct ucred *cred) { MAC_PERFORM(proc_associate_nfsd, cred); } void mac_thread_userret(struct thread *td) { MAC_PERFORM(thread_userret, td); } /* * When a new process is created, its label must be initialized. Generally, * this involves inheritence from the parent process, modulo possible deltas. * This function allows that processing to take place. */ void mac_cred_copy(struct ucred *src, struct ucred *dest) { MAC_PERFORM(cred_copy_label, src->cr_label, dest->cr_label); } int mac_execve_enter(struct image_params *imgp, struct mac *mac_p) { struct label *label; struct mac mac; char *buffer; int error; if (mac_p == NULL) return (0); error = copyin(mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } label = mac_cred_label_alloc(); error = mac_cred_internalize_label(label, buffer); free(buffer, M_MACTEMP); if (error) { mac_cred_label_free(label); return (error); } imgp->execlabel = label; return (0); } void mac_execve_exit(struct image_params *imgp) { if (imgp->execlabel != NULL) { mac_cred_label_free(imgp->execlabel); imgp->execlabel = NULL; } } /* * When relabeling a process, call out to the policies for the maximum * permission allowed for each object type we know about in its memory space, * and revoke access (in the least surprising ways we know) when necessary. * The process lock is not held here. */ void mac_cred_mmapped_drop_perms(struct thread *td, struct ucred *cred) { /* XXX freeze all other threads */ mac_cred_mmapped_drop_perms_recurse(td, cred, &td->td_proc->p_vmspace->vm_map); /* XXX allow other threads to continue */ } static __inline const char * prot2str(vm_prot_t prot) { switch (prot & VM_PROT_ALL) { case VM_PROT_READ: return ("r--"); case VM_PROT_READ | VM_PROT_WRITE: return ("rw-"); case VM_PROT_READ | VM_PROT_EXECUTE: return ("r-x"); case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: return ("rwx"); case VM_PROT_WRITE: return ("-w-"); case VM_PROT_EXECUTE: return ("--x"); case VM_PROT_WRITE | VM_PROT_EXECUTE: return ("-wx"); default: return ("---"); } } static void mac_cred_mmapped_drop_perms_recurse(struct thread *td, struct ucred *cred, struct vm_map *map) { struct vm_map_entry *vme; int vfslocked, result; vm_prot_t revokeperms; vm_object_t backing_object, object; vm_ooffset_t offset; struct vnode *vp; struct mount *mp; if (!mac_mmap_revocation) return; vm_map_lock_read(map); for (vme = map->header.next; vme != &map->header; vme = vme->next) { if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) { mac_cred_mmapped_drop_perms_recurse(td, cred, vme->object.sub_map); continue; } /* * Skip over entries that obviously are not shared. */ if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) || !vme->max_protection) continue; /* * Drill down to the deepest backing object. */ offset = vme->offset; object = vme->object.vm_object; if (object == NULL) continue; VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_UNLOCK(object); object = backing_object; } VM_OBJECT_UNLOCK(object); /* * At the moment, vm_maps and objects aren't considered by * the MAC system, so only things with backing by a normal * object (read: vnodes) are checked. */ if (object->type != OBJT_VNODE) continue; vp = (struct vnode *)object->handle; vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); result = vme->max_protection; mac_vnode_check_mmap_downgrade(cred, vp, &result); VOP_UNLOCK(vp, 0, td); /* * Find out what maximum protection we may be allowing now * but a policy needs to get removed. */ revokeperms = vme->max_protection & ~result; if (!revokeperms) { VFS_UNLOCK_GIANT(vfslocked); continue; } printf("pid %ld: revoking %s perms from %#lx:%ld " "(max %s/cur %s)\n", (long)td->td_proc->p_pid, prot2str(revokeperms), (u_long)vme->start, (long)(vme->end - vme->start), prot2str(vme->max_protection), prot2str(vme->protection)); vm_map_lock_upgrade(map); /* * This is the really simple case: if a map has more * max_protection than is allowed, but it's not being * actually used (that is, the current protection is still * allowed), we can just wipe it out and do nothing more. */ if ((vme->protection & revokeperms) == 0) { vme->max_protection -= revokeperms; } else { if (revokeperms & VM_PROT_WRITE) { /* * In the more complicated case, flush out all * pending changes to the object then turn it * copy-on-write. */ vm_object_reference(object); (void) vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_LOCK(object); vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + vme->end - vme->start + PAGE_MASK), OBJPC_SYNC); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); vm_object_deallocate(object); /* * Why bother if there's no read permissions * anymore? For the rest, we need to leave * the write permissions on for COW, or * remove them entirely if configured to. */ if (!mac_mmap_revocation_via_cow) { vme->max_protection &= ~VM_PROT_WRITE; vme->protection &= ~VM_PROT_WRITE; } if ((revokeperms & VM_PROT_READ) == 0) vme->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; } if (revokeperms & VM_PROT_EXECUTE) { vme->max_protection &= ~VM_PROT_EXECUTE; vme->protection &= ~VM_PROT_EXECUTE; } if (revokeperms & VM_PROT_READ) { vme->max_protection = 0; vme->protection = 0; } pmap_protect(map->pmap, vme->start, vme->end, vme->protection & ~revokeperms); vm_map_simplify_entry(map, vme); } vm_map_lock_downgrade(map); VFS_UNLOCK_GIANT(vfslocked); } vm_map_unlock_read(map); } /* * When the subject's label changes, it may require revocation of privilege * to mapped objects. This can't be done on-the-fly later with a unified * buffer cache. */ void mac_cred_relabel(struct ucred *cred, struct label *newlabel) { MAC_PERFORM(cred_relabel, cred, newlabel); } int mac_cred_check_relabel(struct ucred *cred, struct label *newlabel) { int error; MAC_CHECK(cred_check_relabel, cred, newlabel); return (error); } int mac_cred_check_visible(struct ucred *cr1, struct ucred *cr2) { int error; MAC_CHECK(cred_check_visible, cr1, cr2); return (error); } int mac_proc_check_debug(struct ucred *cred, struct proc *p) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_debug, cred, p); return (error); } int mac_proc_check_sched(struct ucred *cred, struct proc *p) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_sched, cred, p); return (error); } int mac_proc_check_signal(struct ucred *cred, struct proc *p, int signum) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_signal, cred, p, signum); return (error); } int mac_proc_check_setuid(struct proc *p, struct ucred *cred, uid_t uid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setuid, cred, uid); return (error); } int mac_proc_check_seteuid(struct proc *p, struct ucred *cred, uid_t euid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_seteuid, cred, euid); return (error); } int mac_proc_check_setgid(struct proc *p, struct ucred *cred, gid_t gid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setgid, cred, gid); return (error); } int mac_proc_check_setegid(struct proc *p, struct ucred *cred, gid_t egid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setegid, cred, egid); return (error); } int mac_proc_check_setgroups(struct proc *p, struct ucred *cred, int ngroups, gid_t *gidset) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setgroups, cred, ngroups, gidset); return (error); } int mac_proc_check_setreuid(struct proc *p, struct ucred *cred, uid_t ruid, uid_t euid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setreuid, cred, ruid, euid); return (error); } int mac_proc_check_setregid(struct proc *proc, struct ucred *cred, gid_t rgid, gid_t egid) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); MAC_CHECK(proc_check_setregid, cred, rgid, egid); return (error); } int mac_proc_check_setresuid(struct proc *p, struct ucred *cred, uid_t ruid, uid_t euid, uid_t suid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setresuid, cred, ruid, euid, suid); return (error); } int mac_proc_check_setresgid(struct proc *p, struct ucred *cred, gid_t rgid, gid_t egid, gid_t sgid) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_setresgid, cred, rgid, egid, sgid); return (error); } int mac_proc_check_wait(struct ucred *cred, struct proc *p) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); MAC_CHECK(proc_check_wait, cred, p); return (error); } Index: head/sys/security/mac/mac_syscalls.c =================================================================== --- head/sys/security/mac/mac_syscalls.c (revision 175201) +++ head/sys/security/mac/mac_syscalls.c (revision 175202) @@ -1,702 +1,702 @@ /*- * Copyright (c) 1999-2002, 2006 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC int __mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { char *elements, *buffer; struct mac mac; struct proc *tproc; struct ucred *tcred; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); tproc = pfind(uap->pid); if (tproc == NULL) return (ESRCH); tcred = NULL; /* Satisfy gcc. */ error = p_cansee(td, tproc); if (error == 0) tcred = crhold(tproc->p_ucred); PROC_UNLOCK(tproc); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); crfree(tcred); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(tcred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); crfree(tcred); return (error); } int __mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { char *elements, *buffer; struct mac mac; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(td->td_ucred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int __mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { struct ucred *newcred, *oldcred; struct label *intlabel; struct proc *p; struct mac mac; char *buffer; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_cred_label_alloc(); error = mac_cred_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; error = mac_cred_check_relabel(oldcred, intlabel); if (error) { PROC_UNLOCK(p); crfree(newcred); goto out; } setsugid(p); crcopy(newcred, oldcred); mac_cred_relabel(newcred, intlabel); p->p_ucred = newcred; /* * Grab additional reference for use while revoking mmaps, prior to * releasing the proc lock and sharing the cred. */ crhold(newcred); PROC_UNLOCK(p); mac_cred_mmapped_drop_perms(td, newcred); crfree(newcred); /* Free revocation reference. */ crfree(oldcred); out: mac_cred_label_free(intlabel); return (error); } int __mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { char *elements, *buffer; struct label *intlabel; struct file *fp; struct mac mac; struct vnode *vp; struct pipe *pipe; struct socket *so; short label_type; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = fget(td, uap->fd, &fp); if (error) goto out; label_type = fp->f_type; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: vp = fp->f_vnode; intlabel = mac_vnode_label_alloc(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mac_vnode_copy_label(vp->v_label, intlabel); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: pipe = fp->f_data; intlabel = mac_pipe_label_alloc(); PIPE_LOCK(pipe); mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel); PIPE_UNLOCK(pipe); error = mac_pipe_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: so = fp->f_data; intlabel = mac_socket_label_alloc(M_WAITOK); SOCK_LOCK(so); mac_socket_copy_label(so->so_label, intlabel); SOCK_UNLOCK(so); error = mac_socket_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_socket_label_free(intlabel); break; default: error = EINVAL; } fdrop(fp, td); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int __mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); vfslocked = NDHASGIANT(&nd); mac_vnode_copy_label(nd.ni_vp->v_label, intlabel); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int __mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); vfslocked = NDHASGIANT(&nd); mac_vnode_copy_label(nd.ni_vp->v_label, intlabel); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int __mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { struct label *intlabel; struct pipe *pipe; struct socket *so; struct file *fp; struct mount *mp; struct vnode *vp; struct mac mac; char *buffer; int error, vfslocked; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } error = fget(td, uap->fd, &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); if (error) { mac_vnode_label_free(intlabel); break; } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) { VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); break; } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = vn_setlabel(vp, intlabel, td->td_ucred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: intlabel = mac_pipe_label_alloc(); error = mac_pipe_internalize_label(intlabel, buffer); if (error == 0) { pipe = fp->f_data; PIPE_LOCK(pipe); error = mac_pipe_label_set(td->td_ucred, pipe->pipe_pair, intlabel); PIPE_UNLOCK(pipe); } mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: intlabel = mac_socket_label_alloc(M_WAITOK); error = mac_socket_internalize_label(intlabel, buffer); if (error == 0) { so = fp->f_data; error = mac_socket_label_set(td->td_ucred, so, intlabel); } mac_socket_label_free(intlabel); break; default: error = EINVAL; } fdrop(fp, td); out: free(buffer, M_MACTEMP); return (error); } int __mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); out: mac_vnode_label_free(intlabel); return (error); } int __mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); out: mac_vnode_label_free(intlabel); return (error); } int mac_syscall(struct thread *td, struct mac_syscall_args *uap) { struct mac_policy_conf *mpc; char target[MAC_MAX_POLICY_NAME]; int entrycount, error; error = copyinstr(uap->policy, target, sizeof(target), NULL); if (error) return (error); error = ENOSYS; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); goto out; } } if ((entrycount = mac_policy_list_conditional_busy()) != 0) { LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); break; } } mac_policy_list_unbusy(); } out: return (error); } #else /* !MAC */ int __mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { return (ENOSYS); } int __mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { return (ENOSYS); } int __mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { return (ENOSYS); } int __mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { return (ENOSYS); } int __mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (ENOSYS); } int __mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (ENOSYS); } int __mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { return (ENOSYS); } int __mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (ENOSYS); } int __mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (ENOSYS); } int mac_syscall(struct thread *td, struct mac_syscall_args *uap) { return (ENOSYS); } #endif /* !MAC */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 175201) +++ head/sys/sys/vnode.h (revision 175202) @@ -1,742 +1,742 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ /* * XXX - compatability until lockmgr() goes away or all the #includes are * updated. */ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * f - freelist mutex * G - Giant * i - interlock * m - mntvnodes mutex * p - pollinfo lock * s - spechash mutex * S - syncer mutex * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). * * XXX Not all fields are locked yet and some fields that are marked are not * locked consistently. This is a work in progress. Requires Giant! */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type; /* u vnode type */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. */ LIST_ENTRY(vnode) v_hashlist; u_int v_hash; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct vnode *v_dd; /* c .. vnode */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ int v_holdcnt; /* i prevents recycling. */ int v_usecount; /* i ref count of users */ u_long v_iflag; /* i vnode flags (see below) */ u_long v_vflag; /* v vnode flags */ int v_writecount; /* v ref count of writers */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_freelist; /* f vnode freelist */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), (a)); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, 1) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_AGE 0x0040 /* Insert vnode at head of free list */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_OBJDIRTY 0x0400 /* object might be dirty */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_MARK_ATIME 0x04 /* setting atime for execve/mmap */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Modes. Some values same as Ixxx entries from inode.h for now. */ #define VEXEC 000100 /* execute/search permission */ #define VWRITE 000200 /* write permission */ #define VREAD 000400 /* read permission */ #define VSVTX 001000 /* save swapped text even after use */ #define VSGID 002000 /* set group id on execution */ #define VSUID 004000 /* set user id on execution */ #define VADMIN 010000 /* permission to administer */ #define VSTAT 020000 /* permission to retrieve attrs */ #define VAPPEND 040000 /* permission to write/append */ #define VALLPERM (VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern int async_io_version; /* 0 or POSIX version of AIO i'face */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime)(int deltat); #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str); #endif void assert_vop_locked(struct vnode *vp, const char *str); #if 0 voi0 assert_vop_slocked(struct vnode *vp, const char *str); #endif void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str)) #endif #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) assert_vop_slocked((vp), (str)) #endif #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) #define ASSERT_VI_UNLOCKED(vp, str) #define ASSERT_VOP_ELOCKED(vp, str) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) #endif #define ASSERT_VOP_LOCKED(vp, str) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) #endif #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; extern int (*lease_check_hook)(struct vop_lease_args *); /* cache_* may belong in namei.h. */ void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); void cache_purge(struct vnode *vp); void cache_purgevfs(struct mount *mp); int change_dir(struct vnode *vp, struct thread *td); int change_root(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int lease_check(struct vop_lease_args *ap); int speedup_syncer(void); #define textvp_fullpath(p, rb, rfb) \ vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb) int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, mode_t acc_mode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); int vinvalbuf(struct vnode *vp, int save, struct thread *td, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); #define vprint(label, vp) vn_printf((vp), "%s\n", (label)) int vrecycle(struct vnode *vp, struct thread *td); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); -int _vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line); -#define vn_lock(vp, flags, td) _vn_lock(vp, flags, td, __FILE__, __LINE__) +int _vn_lock(struct vnode *vp, int flags, char *file, int line); +#define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, struct ucred *cred, struct file *fp); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, int *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_write_suspend_wait(struct vnode *vp, struct mount *mp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp); int vfs_write_suspend(struct mount *mp); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); /* These are called from within the actual VOPS. */ void vop_create_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_strategy_pre(void *a); void vop_symlink_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error, osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred, \ curthread); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags, td) VOP_LOCK1(vp, flags, td, __FILE__, __LINE__) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct thread *td); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 175201) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 175202) @@ -1,2527 +1,2527 @@ /*- * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { return (EINVAL); } int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { return (EINVAL); } void ffs_snapremove(vp) struct vnode *vp; { } void ffs_snapshot_mount(mp) struct mount *mp; { } void ffs_snapshot_unmount(mp) struct mount *mp; { } void ffs_snapgone(ip) struct inode *ip; { } int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { return (EINVAL); } #else TAILQ_HEAD(snaphead, inode); struct snapdata { struct snaphead sn_head; daddr_t sn_listsize; daddr_t *sn_blklist; struct lock sn_lock; }; static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void process_deferred_inactive(struct mount *); static void try_free_snapdata(struct vnode *devvp, struct thread *td); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the following flag allows this * crash persistence to be disabled. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; int flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp, *sbp = NULL; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; ump = VFSTOUFS(mp); fs = ump->um_fs; sn = NULL; MNT_ILOCK(mp); flag = mp->mnt_flag; MNT_IUNLOCK(mp); /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); VOP_UNLOCK(nd.ni_dvp, 0, td); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); return (error); } vp = nd.ni_vp; vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ip->i_devvp; /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP_SET(ip, i_size, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; error = readblock(vp, bp, numblks - 1); bawrite(bp); if (error != 0) goto out; /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = howmany(fs->fs_ncg, NBBY); MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (error) goto out; } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) goto out; VOP_UNLOCK(vp, 0, td); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); PROC_SLOCK(p); saved_nice = p->p_nice; sched_nice(p, 0); PROC_SUNLOCK(p); PROC_UNLOCK(p); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) { error = ENOENT; /* Snapshot file unlinked */ goto out1; } if (collectsnapstats) nanotime(&starttime); /* The last block might have changed. Copy it again to be sure. */ error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error != 0) goto out1; error = readblock(vp, bp, numblks - 1); bp->b_flags |= B_VALIDSUSPWRT; bawrite(bp); if (error != 0) goto out1; /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &sbp); if (error) { brelse(sbp); sbp = NULL; goto out1; } loc = blkoff(fs, fs->fs_sblockloc); copy_fs = (struct fs *)(sbp->b_data + loc); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; loop: MNT_VNODE_FOREACH(xvp, mp, mvp) { VI_LOCK(xvp); MNT_IUNLOCK(mp); if ((xvp->v_iflag & VI_DOOMED) || (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } vholdl(xvp); - if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { + if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp, 0, td); vdrop(xvp); MNT_ILOCK(mp); continue; } VI_UNLOCK(xvp); if (snapdebug) vprint("ffs_snapshot: busy vnode", xvp); if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0, td); vdrop(xvp); MNT_ILOCK(mp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0, td); vdrop(xvp); MNT_ILOCK(mp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode); VOP_UNLOCK(xvp, 0, td); vdrop(xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; MNT_VNODE_FOREACH_ABORT(mp, mvp); goto out1; } MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the snapshots * to share. In either case, acquire the snapshot lock and give * up our original private lock. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn != NULL) { xp = TAILQ_FIRST(&sn->sn_head); VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; mp_fixme("si_snapdata setting is racey."); devvp->v_rdev->si_snapdata = sn; xp = NULL; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg) > blkno)) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %d already on list", ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); out1: KASSERT((sn != NULL && sbp != NULL && error == 0) || (sn == NULL && sbp == NULL && error != 0), ("email phk@ and mckusick@")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount); vn_start_write(NULL, &wrtmp, V_WAIT); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (sbp == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP); if (error == 0 && xp->i_effnlink == 0) { error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode); } if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ if (ip->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len;; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) FREE(space, M_UFSMNT); /* * If another process is currently writing the buffer containing * the inode for this snapshot then a deadlock can occur. Drop * the snapshot lock until the buffer has been written. */ VREF(vp); /* Protect against ffs_snapgone() */ VOP_UNLOCK(vp, 0, td); (void) bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int) fs->fs_bsize, NOCRED, &nbp); brelse(nbp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) error = ENOENT; /* Snapshot file unlinked */ else vrele(vp); /* Drop extra reference */ done: FREE(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (saved_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); PROC_SLOCK(p); sched_nice(td->td_proc, saved_nice); PROC_SUNLOCK(p); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { FREE(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); MNT_IUNLOCK(mp); if (error) (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void) ffs_syncvnode(vp, MNT_WAIT); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); vrele(nd.ni_dvp); vn_finished_write(wrtmp); process_deferred_inactive(mp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. */ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ip->i_fs; error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (EIO); } UFS_LOCK(ip->i_ump); ACTIVESET(fs, cg); UFS_UNLOCK(ip->i_ump); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = 0; } if (ip->i_ump->um_fstype == UFS1) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ip->i_devvp->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %d\n", ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ fs = ip->i_fs; ump = ip->i_ump; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; struct thread *td = curthread; ufs2_daddr_t numblks, blkno, dblk; int error, loc, last; struct snapdata *sn; ip = VTOI(vp); fs = ip->i_fs; devvp = ip->i_devvp; /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td); VI_LOCK(vp); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapremove: lost lock mutation")); vp->v_vnlock = &vp->v_lock; VI_UNLOCK(vp); VI_LOCK(devvp); lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td); try_free_snapdata(devvp, td); } else VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). */ for (blkno = 1; blkno < NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (ip->i_ump->um_fstype == UFS1) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; #ifdef QUOTA /* * Reenable disk quotas for ex-snapshot file. */ if (!getinoquota(ip)) (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * Lookup block being written. */ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %d lbn %jd from inum %d\n", "Grabonremove: snapino", ip->i_number, (intmax_t)lbn, inum); #endif if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (1); } if (lbn >= NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", "Copyonremove: snapino ", ip->i_number, (intmax_t)lbn, "for inum", inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = devvp->v_rdev->si_snapdata; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if ((ip->i_flags & SF_SNAPSHOT) == 0) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void)ffs_syncvnode(vp, MNT_WAIT); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the * snapshots to share. In either case, acquire the snapshot * lock and give up our original private lock. */ VI_LOCK(devvp); if (sn != NULL) { VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; devvp->v_rdev->si_snapdata = sn; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %d already on list", ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp, 0, td); lastvp = vp; } vp = lastvp; /* * No usable snapshots found. */ if (vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp, 0, td); return; } MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp, 0, td); FREE(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp, 0, td); VI_LOCK(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; struct thread *td = curthread; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { vp = ITOV(xp); TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp), td); VI_LOCK(vp); lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(vp), td); VI_LOCK(vp); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapshot_unmount: lost lock mutation")); vp->v_vnlock = &vp->v_lock; VI_UNLOCK(vp); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td); if (xp->i_effnlink > 0) vrele(vp); VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; } try_free_snapdata(devvp, td); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } /* * Check the buffer block to be belong to device buffer that shall be * locked after snaplk. devvp shall be locked on entry, and will be * leaved locked upon exit. */ static int ffs_bp_snapblk(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = TAILQ_FIRST(&sn->sn_head)->i_fs; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(bo, bp) struct bufobj *bo; struct buf *bp; { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo->__bo_vnode; KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't countdeps with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { if (BO_MTX(bo) != VI_MTX(vp)) { VI_UNLOCK(devvp); BO_LOCK(bo); } BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = 0; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ip->i_fs; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef INVARIANTS if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %d lbn %jd for ", ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %d", VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } return (error); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip = VTOI(vp); struct bio *bip; bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; bip->bio_done = NULL; g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); bp->b_error = biowait(bip, "snaprdb"); g_destroy_bio(bip); return (bp->b_error); } /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ static void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; struct thread *td; int error; td = curthread; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. */ if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); vholdl(vp); - error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); + error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error != 0) { vdrop(vp); MNT_ILOCK(mp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; ip->i_flag |= IN_MODIFIED; } VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("process_deferred_inactive: " "recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); (void) VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("process_deferred_inactive: lost VI_DOINGINACT")); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("process_deferred_inactive: got VI_OWEINACT")); vp->v_iflag &= ~VI_DOINGINACT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_secondary_write(mp); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp, struct thread *td) { struct snapdata *sn; ufs2_daddr_t *snapblklist; sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) { VI_UNLOCK(devvp); return; } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td); lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td); lockdestroy(&sn->sn_lock); free(sn, M_UFSMNT); if (snapblklist != NULL) FREE(snapblklist, M_UFSMNT); } #endif Index: head/sys/ufs/ffs/ffs_softdep.c =================================================================== --- head/sys/ufs/ffs/ffs_softdep.c (revision 175201) +++ head/sys/ufs/ffs/ffs_softdep.c (revision 175202) @@ -1,6302 +1,6302 @@ /*- * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * Further information about soft updates can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 */ #include __FBSDID("$FreeBSD$"); /* * For now we want the safety net that the DEBUG flag provides. */ #ifndef DEBUG #define DEBUG #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_ffs.h" #include "opt_quota.h" #ifndef SOFTUPDATES int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { panic("softdep_flushfiles called"); } int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { return (0); } void softdep_initialize() { return; } void softdep_uninitialize() { return; } void softdep_setup_inomapdep(bp, ip, newinum) struct buf *bp; struct inode *ip; ino_t newinum; { panic("softdep_setup_inomapdep called"); } void softdep_setup_blkmapdep(bp, mp, newblkno) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; { panic("softdep_setup_blkmapdep called"); } void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocdirect called"); } void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocext called"); } void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; ufs_lbn_t lbn; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; struct buf *nbp; { panic("softdep_setup_allocindir_page called"); } void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; struct inode *ip; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; { panic("softdep_setup_allocindir_meta called"); } void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; off_t length; int flags; { panic("softdep_setup_freeblocks called"); } void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { panic("softdep_freefile called"); } int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; struct inode *dp; off_t diroffset; ino_t newinum; struct buf *newdirbp; int isnewblk; { panic("softdep_setup_directory_add called"); } void softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) struct inode *dp; caddr_t base; caddr_t oldloc; caddr_t newloc; int entrysize; { panic("softdep_change_directoryentry_offset called"); } void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; int isrmdir; { panic("softdep_setup_remove called"); } void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; ino_t newinum; int isrmdir; { panic("softdep_setup_directory_change called"); } void softdep_change_linkcnt(ip) struct inode *ip; { panic("softdep_change_linkcnt called"); } void softdep_load_inodeblock(ip) struct inode *ip; { panic("softdep_load_inodeblock called"); } void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; struct buf *bp; int waitfor; { panic("softdep_update_inodeblock called"); } int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { return (0); } void softdep_fsync_mountdev(vp) struct vnode *vp; { return; } int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { *countp = 0; return (0); } int softdep_sync_metadata(struct vnode *vp) { return (0); } int softdep_slowdown(vp) struct vnode *vp; { panic("softdep_slowdown called"); } void softdep_releasefile(ip) struct inode *ip; /* inode with the zero effective link count */ { panic("softdep_releasefile called"); } int softdep_request_cleanup(fs, vp) struct fs *fs; struct vnode *vp; { return (0); } int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_deps, int softdep_accdeps, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; int error; (void) softdep_deps, (void) softdep_accdeps; ASSERT_VI_LOCKED(devvp, "softdep_check_suspend"); bo = &devvp->v_bufobj; for (;;) { if (!MNT_ITRYLOCK(mp)) { VI_UNLOCK(devvp); MNT_ILOCK(mp); MNT_IUNLOCK(mp); VI_LOCK(devvp); continue; } if (mp->mnt_secondary_writes != 0) { VI_UNLOCK(devvp); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); VI_LOCK(devvp); continue; } break; } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; VI_UNLOCK(devvp); return (error); } void softdep_get_depcounts(struct mount *mp, int *softdepactivep, int *softdepactiveaccp) { (void) mp; *softdepactivep = 0; *softdepactiveaccp = 0; } #else /* * These definitions need to be adapted to the system to which * this file is being ported. */ /* * malloc types defined for the softdep system. */ static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes"); #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) #define D_PAGEDEP 0 #define D_INODEDEP 1 #define D_NEWBLK 2 #define D_BMSAFEMAP 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 #define D_FREEFRAG 7 #define D_FREEBLKS 8 #define D_FREEFILE 9 #define D_DIRADD 10 #define D_MKDIR 11 #define D_DIRREM 12 #define D_NEWDIRBLK 13 #define D_LAST D_NEWDIRBLK /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, M_NEWBLK, M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM, M_NEWDIRBLK }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ /* * Forward declarations. */ struct inodedep_hashhead; struct newblk_hashhead; struct pagedep_hashhead; /* * Internal function prototypes. */ static void softdep_error(char *, int); static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct mtx *, int); static void clear_remove(struct thread *); static void clear_inodedeps(struct thread *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int flush_inodedep_deps(struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, struct buf *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_inodeblock(struct inodedep *, struct buf *); static void handle_allocdirect_partdone(struct allocdirect *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); static void handle_written_mkdir(struct mkdir *, int); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static void handle_workitem_remove(struct dirrem *, struct vnode *); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); static void free_diradd(struct diradd *); static void free_allocindir(struct allocindir *, struct inodedep *); static void free_newdirblk(struct newdirblk *); static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t, ufs2_daddr_t *); static void deallocate_dependencies(struct buf *, struct inodedep *); static void free_allocdirect(struct allocdirectlst *, struct allocdirect *, int); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void handle_workitem_freeblocks(struct freeblks *, int); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, struct allocindir *); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, struct newblk **); static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct mount *mp, int, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int); static void add_to_worklist(struct worklist *); static void softdep_flush(void); static int softdep_speedup(void); /* * Exported softdep operations. */ static void softdep_disk_io_initiation(struct buf *); static void softdep_disk_write_complete(struct buf *); static void softdep_deallocate_dependencies(struct buf *); static int softdep_count_dependencies(struct buf *bp, int); static struct mtx lk; MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) #define ACQUIRE_LOCK(lk) mtx_lock(lk) #define FREE_LOCK(lk) mtx_unlock(lk) /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ DEBUG #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #else /* DEBUG */ static void worklist_insert(struct workhead *, struct worklist *); static void worklist_remove(struct worklist *); #define WORKLIST_INSERT(head, item) worklist_insert(head, item) #define WORKLIST_REMOVE(item) worklist_remove(item) static void worklist_insert(head, item) struct workhead *head; struct worklist *item; { mtx_assert(&lk, MA_OWNED); if (item->wk_state & ONWORKLIST) panic("worklist_insert: already on list"); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item) struct worklist *item; { mtx_assert(&lk, MA_OWNED); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: not on list"); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); static void workitem_alloc(struct worklist *, int, struct mount *); #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) static void workitem_free(item, type) struct worklist *item; int type; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); #ifdef DEBUG if (item->wk_state & ONWORKLIST) panic("workitem_free: still on list"); if (item->wk_type != type) panic("workitem_free: type mismatch"); #endif ump = VFSTOUFS(item->wk_mp); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); FREE(item, DtoM(type)); } static void workitem_alloc(item, type, mp) struct worklist *item; int type; struct mount *mp; { item->wk_type = type; item->wk_mp = mp; item->wk_state = 0; ACQUIRE_LOCK(&lk); VFSTOUFS(mp)->softdep_deps++; VFSTOUFS(mp)->softdep_accdeps++; FREE_LOCK(&lk); } /* * Workitem queue management */ static int max_softdeps; /* maximum number of structs before slowdown */ static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static int *stat_countp; /* statistic to count in proc_waiting timeout */ static struct callout_handle handle; /* handle on posted proc_waiting timeout */ static int req_pending; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ #define FLUSH_INODES 1 static int req_clear_remove; /* syncer process flush some freeblks */ #define FLUSH_REMOVE 2 #define FLUSH_REMOVE_WAIT 3 /* * runtime statistics */ static int stat_worklist_push; /* number of worklist cleanups */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, ""); SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */ SYSCTL_DECL(_vfs_ffs); static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); static struct proc *softdepproc; static struct kproc_desc softdep_kp = { "softdepflush", softdep_flush, &softdepproc }; SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp) static void softdep_flush(void) { struct mount *nmp; struct mount *mp; struct ufsmount *ump; struct thread *td; int remaining; int vfslocked; td = curthread; td->td_pflags |= TDP_NORUNNINGBUF; for (;;) { kproc_suspend_check(softdepproc); vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); ACQUIRE_LOCK(&lk); /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(td); req_clear_inodedeps -= 1; wakeup_one(&proc_waiting); } if (req_clear_remove) { clear_remove(td); req_clear_remove -= 1; wakeup_one(&proc_waiting); } FREE_LOCK(&lk); VFS_UNLOCK_GIANT(vfslocked); remaining = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); if ((mp->mnt_flag & MNT_SOFTDEP) == 0) continue; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; vfslocked = VFS_LOCK_GIANT(mp); softdep_process_worklist(mp, 0); ump = VFSTOUFS(mp); remaining += ump->softdep_on_worklist - ump->softdep_on_worklist_inprogress; VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (remaining) continue; ACQUIRE_LOCK(&lk); if (!req_pending) msleep(&req_pending, &lk, PVM, "sdflush", hz); req_pending = 0; FREE_LOCK(&lk); } } static int softdep_speedup(void) { mtx_assert(&lk, MA_OWNED); if (req_pending == 0) { req_pending = 1; wakeup(&req_pending); } return speedup_syncer(); } /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ static void add_to_worklist(wk) struct worklist *wk; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); ump = VFSTOUFS(wk->wk_mp); if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: already on list"); wk->wk_state |= ONWORKLIST; if (LIST_EMPTY(&ump->softdep_workitem_pending)) LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); else LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; ump->softdep_on_worklist += 1; } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new triples will be generated * until all the old ones have been purged from the dependency lists. */ int softdep_process_worklist(mp, full) struct mount *mp; int full; { struct thread *td = curthread; int cnt, matchcnt, loopcount; struct ufsmount *ump; long starttime; KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); /* * Record the process identifier of our caller so that we can give * this process preferential treatment in request_cleanup below. */ matchcnt = 0; ump = VFSTOUFS(mp); ACQUIRE_LOCK(&lk); loopcount = 1; starttime = time_second; while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 0)) == -1) break; else matchcnt += cnt; /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(td); req_clear_inodedeps -= 1; wakeup_one(&proc_waiting); } if (req_clear_remove) { clear_remove(td); req_clear_remove -= 1; wakeup_one(&proc_waiting); } /* * We do not generally want to stop for buffer space, but if * we are really being a buffer hog, we will stop and wait. */ if (loopcount++ % 128 == 0) { FREE_LOCK(&lk); bwillwrite(); ACQUIRE_LOCK(&lk); } /* * Never allow processing to run for more than one * second. Otherwise the other mountpoints may get * excessively backlogged. */ if (!full && starttime != time_second) { matchcnt = -1; break; } } FREE_LOCK(&lk); return (matchcnt); } /* * Process one item on the worklist. */ static int process_worklist_item(mp, flags) struct mount *mp; int flags; { struct worklist *wk, *wkend; struct ufsmount *ump; struct vnode *vp; int matchcnt = 0; mtx_assert(&lk, MA_OWNED); KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to write as we may * recurse into the copy-on-write routine. */ if (curthread->td_pflags & TDP_COWINPROGRESS) return (-1); /* * Normally we just process each item on the worklist in order. * However, if we are in a situation where we cannot lock any * inodes, we have to skip over any dirrem requests whose * vnodes are resident and locked. */ ump = VFSTOUFS(mp); vp = NULL; LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { if (wk->wk_state & INPROGRESS) continue; if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) break; wk->wk_state |= INPROGRESS; ump->softdep_on_worklist_inprogress++; FREE_LOCK(&lk); ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum, LK_NOWAIT | LK_EXCLUSIVE, &vp); ACQUIRE_LOCK(&lk); wk->wk_state &= ~INPROGRESS; ump->softdep_on_worklist_inprogress--; if (vp != NULL) break; } if (wk == 0) return (-1); /* * Remove the item to be processed. If we are removing the last * item on the list, we need to recalculate the tail pointer. * As this happens rarely and usually when the list is short, * we just run down the list to find it rather than tracking it * in the above loop. */ WORKLIST_REMOVE(wk); if (wk == ump->softdep_worklist_tail) { LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) if (LIST_NEXT(wkend, wk_list) == NULL) break; ump->softdep_worklist_tail = wkend; } ump->softdep_on_worklist -= 1; FREE_LOCK(&lk); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); matchcnt++; switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ handle_workitem_remove(WK_DIRREM(wk), vp); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ handle_workitem_freefrag(WK_FREEFRAG(wk)); break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ handle_workitem_freefile(WK_FREEFILE(wk)); break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } vn_finished_secondary_write(mp); ACQUIRE_LOCK(&lk); return (matchcnt); } /* * Move dependencies from one buffer to another. */ void softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; if (!LIST_EMPTY(&newbp->b_dep)) panic("softdep_move_dependencies: need merge code"); wktail = 0; ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else LIST_INSERT_AFTER(wktail, wk, wk_list); wktail = wk; } FREE_LOCK(&lk); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { struct vnode *devvp; int count, error = 0; struct ufsmount *ump; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. We continue until no more worklist dependencies * are found. */ *countp = 0; ump = VFSTOUFS(oldmnt); devvp = ump->um_devvp; while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { *countp += count; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp, 0, td); if (error) break; } return (error); } int softdep_waitidle(struct mount *mp) { struct ufsmount *ump; int error; int i; ump = VFSTOUFS(mp); ACQUIRE_LOCK(&lk); for (i = 0; i < 10 && ump->softdep_deps; i++) { ump->softdep_req = 1; if (ump->softdep_on_worklist) panic("softdep_waitidle: work added after flush."); msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); } ump->softdep_req = 0; FREE_LOCK(&lk); error = 0; if (i == 10) { error = EBUSY; printf("softdep_waitidle: Failed to flush worklist for %p\n", mp); } return (error); } /* * Flush all vnodes and worklist items associated with a specified mount point. */ int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { int error, count, loopcnt; error = 0; /* * Alternately flush the vnodes associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ for (loopcnt = 10; loopcnt > 0; loopcnt--) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) break; if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 || count == 0) break; } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } if (!error) error = softdep_waitidle(oldmnt); return (error); } /* * Structure hashing. * * There are three types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ #define NODELAY 0x0002 /* cannot do background work */ /* * Structures and routines associated with pagedep caching. */ LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; u_long pagedep_hash; /* size of hash table - 1 */ #define PAGEDEP_HASH(mp, inum, lbn) \ (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ pagedep_hash]) static int pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) struct pagedep_hashhead *pagedephd; ino_t ino; ufs_lbn_t lbn; struct mount *mp; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; LIST_FOREACH(pagedep, pagedephd, pd_hash) if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn && mp == pagedep->pd_list.wk_mp) break; if (pagedep) { *pagedeppp = pagedep; if ((flags & DEPALLOC) != 0 && (pagedep->pd_state & ONWORKLIST) == 0) return (0); return (1); } *pagedeppp = NULL; return (0); } /* * Look up a pagedep. Return 1 if found, 0 if not found or found * when asked to allocate but not associated with any buffer. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ static int pagedep_lookup(ip, lbn, flags, pagedeppp) struct inode *ip; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct mount *mp; int ret; int i; mtx_assert(&lk, MA_OWNED); mp = ITOV(ip)->v_mount; pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); if (*pagedeppp || (flags & DEPALLOC) == 0) return (ret); FREE_LOCK(&lk); MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(&lk); ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); if (*pagedeppp) { WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } pagedep->pd_ino = ip->i_number; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; static u_long inodedep_hash; /* size of hash table - 1 */ static long num_inodedep; /* number of inodedep allocated */ #define INODEDEP_HASH(fs, inum) \ (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) static int inodedep_find(inodedephd, fs, inum, inodedeppp) struct inodedep_hashhead *inodedephd; struct fs *fs; ino_t inum; struct inodedep **inodedeppp; { struct inodedep *inodedep; LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino && fs == inodedep->id_fs) break; if (inodedep) { *inodedeppp = inodedep; return (1); } *inodedeppp = NULL; return (0); } /* * Look up an inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. */ static int inodedep_lookup(mp, inum, flags, inodedeppp) struct mount *mp; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; struct fs *fs; mtx_assert(&lk, MA_OWNED); fs = VFSTOUFS(mp)->um_fs; inodedephd = INODEDEP_HASH(fs, inum); if (inodedep_find(inodedephd, fs, inum, inodedeppp)) return (1); if ((flags & DEPALLOC) == 0) return (0); /* * If we are over our limit, try to improve the situation. */ if (num_inodedep > max_softdeps && (flags & NODELAY) == 0) request_cleanup(mp, FLUSH_INODES); FREE_LOCK(&lk); MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); ACQUIRE_LOCK(&lk); if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } num_inodedep += 1; inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_buf = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); TAILQ_INIT(&inodedep->id_newextupdt); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; u_long newblk_hash; /* size of hash table - 1 */ #define NEWBLK_HASH(fs, inum) \ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static int newblk_find(newblkhd, fs, newblkno, newblkpp) struct newblk_hashhead *newblkhd; struct fs *fs; ufs2_daddr_t newblkno; struct newblk **newblkpp; { struct newblk *newblk; LIST_FOREACH(newblk, newblkhd, nb_hash) if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) break; if (newblk) { *newblkpp = newblk; return (1); } *newblkpp = NULL; return (0); } /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. */ static int newblk_lookup(fs, newblkno, flags, newblkpp) struct fs *fs; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; newblkhd = NEWBLK_HASH(fs, newblkno); if (newblk_find(newblkhd, fs, newblkno, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(&lk); MALLOC(newblk, struct newblk *, sizeof(struct newblk), M_NEWBLK, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { FREE(newblk, M_NEWBLK); return (1); } newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; return (0); } /* * Executed during filesystem system initialization before * mounting any filesystems. */ void softdep_initialize() { LIST_INIT(&mkdirlisthd); max_softdeps = desiredvnodes * 4; pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; } /* * Executed after all filesystems have been unmounted during * filesystem module unload. */ void softdep_uninitialize() { hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. */ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum_total cstotal; struct ufsmount *ump; struct cg *cgp; struct buf *bp; int error, cyl; MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | MNTK_SOFTDEP; mp->mnt_noasync++; } MNT_IUNLOCK(mp); ump = VFSTOUFS(mp); LIST_INIT(&ump->softdep_workitem_pending); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation * can take a long time and can be deferred for background * fsck. However, the old behavior of scanning the cylinder * groups and recalculating them at mount time is available * by setting vfs.ffs.compute_summary_at_mount to one. */ if (compute_summary_at_mount == 0 || fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef DEBUG if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicate that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs filesystem maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps. * There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ { struct inodedep *inodedep; struct bmsafemap *bmsafemap; /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ ACQUIRE_LOCK(&lk); if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY, &inodedep))) panic("softdep_setup_inomapdep: dependency for new inode " "already exists"); inodedep->id_buf = bp; inodedep->id_state &= ~DEPCOMPLETE; bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp); LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); FREE_LOCK(&lk); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, mp, newblkno) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ { struct newblk *newblk; struct bmsafemap *bmsafemap; struct fs *fs; fs = VFSTOUFS(mp)->um_fs; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ ACQUIRE_LOCK(&lk); if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp); LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); FREE_LOCK(&lk); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * splbio interrupts blocked. */ static struct bmsafemap * bmsafemap_lookup(mp, bp) struct mount *mp; struct buf *bp; { struct bmsafemap *bmsafemap; struct worklist *wk; mtx_assert(&lk, MA_OWNED); LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_BMSAFEMAP) return (WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_allocdirecthd); LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_newblkhd); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. */ void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t lbn; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct pagedep *pagedep; struct newblk *newblk; struct mount *mp; mp = UFSTOVFS(ip->i_ump); MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); adp->ad_lbn = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED; LIST_INIT(&adp->ad_newdirblk); if (newblkno == oldblkno) adp->ad_freefrag = NULL; else adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); ACQUIRE_LOCK(&lk); if (lbn >= NDADDR) { /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; adp->ad_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_lbn <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_lbn >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } /* * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct worklist *wk; struct freefrag *freefrag; struct newdirblk *newdirblk; mtx_assert(&lk, MA_OWNED); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_lbn >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_allocdirect. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } /* * If we are tracking a new directory-block allocation, * move it from the old allocdirect to the new allocdirect. */ if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); if (!LIST_EMPTY(&oldadp->ad_newdirblk)) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); } free_allocdirect(adphead, oldadp, 0); } /* * Allocate a new freefrag structure if needed. */ static struct freefrag * newfreefrag(ip, blkno, size) struct inode *ip; ufs2_daddr_t blkno; long size; { struct freefrag *freefrag; struct fs *fs; if (blkno == 0) return (NULL); fs = ip->i_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); freefrag->ff_inum = ip->i_number; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. */ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(&lk); } /* * Set up a dependency structure for an external attributes data block. * This routine follows much of the structure of softdep_setup_allocdirect. * See the description of softdep_setup_allocdirect above for details. */ void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct newblk *newblk; struct mount *mp; mp = UFSTOVFS(ip->i_ump); MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); adp->ad_lbn = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED | EXTDATA; LIST_INIT(&adp->ad_newdirblk); if (newblkno == oldblkno) adp->ad_freefrag = NULL; else adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); ACQUIRE_LOCK(&lk); if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; adp->ad_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); if (lbn >= NXADDR) panic("softdep_setup_allocext: lbn %lld > NXADDR", (long long)lbn); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_lbn <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_lbn >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ { struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump)); aip->ai_state = ATTACHED; aip->ai_offset = ptrno; aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. */ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct allocindir *aip; struct pagedep *pagedep; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); aip = newallocindir(ip, ptrno, newblkno, oldblkno); ACQUIRE_LOCK(&lk); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); setup_allocindir_phase2(bp, ip, aip); FREE_LOCK(&lk); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { struct allocindir *aip; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); setup_allocindir_phase2(bp, ip, aip); FREE_LOCK(&lk); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void setup_allocindir_phase2(bp, ip, aip) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct allocindir *aip; /* allocindir allocated by the above routines */ { struct worklist *wk; struct indirdep *indirdep, *newindirdep; struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; struct newblk *newblk; ufs2_daddr_t blkno; mtx_assert(&lk, MA_OWNED); if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); for (indirdep = NULL, newindirdep = NULL; ; ) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); newindirdep = NULL; } if (indirdep) { if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, &newblk) == 0) panic("setup_allocindir: lost block"); if (newblk->nb_state == DEPCOMPLETE) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; aip->ai_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, aip, ai_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old * dependency into the new one. */ if (aip->ai_oldblkno == 0) oldaip = NULL; else LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) if (oldaip->ai_offset == aip->ai_offset) break; freefrag = NULL; if (oldaip != NULL) { if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("setup_allocindir_phase2: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = aip->ai_freefrag; aip->ai_freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = NULL; free_allocindir(oldaip, NULL); } LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); if (ip->i_ump->um_fstype == UFS1) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; FREE_LOCK(&lk); if (freefrag != NULL) handle_workitem_freefrag(freefrag); } else FREE_LOCK(&lk); if (newindirdep) { newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; brelse(newindirdep->ir_savebp); ACQUIRE_LOCK(&lk); WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); if (indirdep) break; FREE_LOCK(&lk); } if (indirdep) { ACQUIRE_LOCK(&lk); break; } MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, UFSTOVFS(ip->i_ump)); newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); ACQUIRE_LOCK(&lk); } } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. */ void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct vnode *vp; struct buf *bp; struct fs *fs; ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; fs = ip->i_fs; mp = UFSTOVFS(ip->i_ump); if (length != 0) panic("softdep_setup_freeblocks: non-zero length"); MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks = DIP(ip, i_blocks) - extblocks; if ((flags & IO_NORMAL) == 0) { freeblks->fb_oldsize = 0; freeblks->fb_chkcnt = 0; } else { freeblks->fb_oldsize = ip->i_size; ip->i_size = 0; DIP_SET(ip, i_size, 0); freeblks->fb_chkcnt = datablocks; for (i = 0; i < NDADDR; i++) { freeblks->fb_dblks[i] = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], 0); } for (i = 0; i < NIADDR; i++) { freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], 0); } /* * If the file was removed, then the space being freed was * accounted for then (see softdep_releasefile()). If the * file is merely being truncated, then we account for it now. */ if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ip->i_ump); } } if ((flags & IO_EXT) == 0) { freeblks->fb_oldextsize = 0; } else { freeblks->fb_oldextsize = ip->i_din2->di_extsize; ip->i_din2->di_extsize = 0; freeblks->fb_chkcnt += extblocks; for (i = 0; i < NXADDR; i++) { freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; } } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); /* * Push the zero'ed inode to to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); softdep_error("softdep_setup_freeblocks", error); } if (ip->i_ump->um_fstype == UFS1) *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; else *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (delay == 0), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. * If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. */ if (flags & IO_NORMAL) { merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) free_allocdirect(&inodedep->id_inoupdt, adp, delay); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) free_allocdirect(&inodedep->id_extupdt, adp, delay); } FREE_LOCK(&lk); bdwrite(bp); /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ vp = ITOV(ip); VI_LOCK(vp); drain_output(vp); restart: TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) continue; if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL) goto restart; VI_UNLOCK(vp); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); deallocate_dependencies(bp, inodedep); FREE_LOCK(&lk); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); VI_LOCK(vp); goto restart; } VI_UNLOCK(vp); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); if(delay) { freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk * we can start freeing blocks. Add freeblks to the worklist * instead of calling handle_workitem_freeblocks directly as * it is more likely that additional IO is needed to complete * the request here than in the !delay case. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freeblks->fb_list); } FREE_LOCK(&lk); /* * If the inode has never been written to disk (delay == 0), * then we can process the freeblks now that we have deleted * the dependencies. */ if (!delay) handle_workitem_freeblocks(freeblks, 0); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static void deallocate_dependencies(bp, inodedep) struct buf *bp; struct inodedep *inodedep; { struct worklist *wk; struct indirdep *indirdep; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("deallocate_dependencies: already gone"); indirdep->ir_state |= GOINGAWAY; VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); WORKLIST_REMOVE(wk); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* * None of the directory additions will ever be * visible, so they can simply be tossed. */ for (i = 0; i < DAHASHSZ; i++) while ((dap = LIST_FIRST(&pagedep->pd_diraddhd[i]))) free_diradd(dap); while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) free_diradd(dap); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. * If the inode has already been written, then they * can be dumped directly onto the work list. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&dirrem->dm_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } if ((pagedep->pd_state & NEWBLOCK) != 0) { LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) if (wk->wk_type == D_NEWDIRBLK && WK_NEWDIRBLK(wk)->db_pagedep == pagedep) break; if (wk != NULL) { WORKLIST_REMOVE(wk); free_newdirblk(WK_NEWDIRBLK(wk)); } else panic("deallocate_dependencies: " "lost pagedep"); } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: free_allocindir(WK_ALLOCINDIR(wk), inodedep); continue; case D_ALLOCDIRECT: case D_INODEDEP: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ default: panic("deallocate_dependencies: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } } /* * Free an allocdirect. Generate a new freefrag work request if appropriate. * This routine must be called with splbio interrupts blocked. */ static void free_allocdirect(adphead, adp, delay) struct allocdirectlst *adphead; struct allocdirect *adp; int delay; { struct newdirblk *newdirblk; struct worklist *wk; mtx_assert(&lk, MA_OWNED); if ((adp->ad_state & DEPCOMPLETE) == 0) LIST_REMOVE(adp, ad_deps); TAILQ_REMOVE(adphead, adp, ad_next); if ((adp->ad_state & COMPLETE) == 0) WORKLIST_REMOVE(&adp->ad_list); if (adp->ad_freefrag != NULL) { if (delay) WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &adp->ad_freefrag->ff_list); else add_to_worklist(&adp->ad_freefrag->ff_list); } if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); if (!LIST_EMPTY(&adp->ad_newdirblk)) panic("free_allocdirect: extra newdirblk"); if (delay) WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &newdirblk->db_list); else free_newdirblk(newdirblk); } WORKITEM_FREE(adp, D_ALLOCDIRECT); } /* * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. * This routine must be called with splbio interrupts blocked. */ static void free_newdirblk(newdirblk) struct newdirblk *newdirblk; { struct pagedep *pagedep; struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); /* * If the pagedep is still linked onto the directory buffer * dependency chain, then some of the entries on the * pd_pendinghd list may not be committed to disk yet. In * this case, we will simply clear the NEWBLOCK flag and * let the pd_pendinghd list be processed when the pagedep * is next written. If the pagedep is no longer on the buffer * dependency chain, then all the entries on the pd_pending * list are committed to disk and we can free them here. */ pagedep = newdirblk->db_pagedep; pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap); /* * If no dependencies remain, the pagedep will be freed. */ for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) break; if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; /* * This sets up the inode de-allocation dependency. */ MALLOC(freefile, struct freefile *, sizeof(struct freefile), M_FREEFILE, M_SOFTDEP_FLAGS); workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendinginodes += 1; UFS_UNLOCK(ip->i_ump); } /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can free the file immediately. */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); handle_workitem_freefile(freefile); return; } WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); FREE_LOCK(&lk); ip->i_flag |= IN_MODIFIED; } /* * Check to see if an inode has never been written to disk. If * so free the inodedep and return success, otherwise return failure. * This routine must be called with splbio interrupts blocked. * * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We set the * ALLCOMPLETE flags since the bitmap now properly shows that the * inode is not allocated. Even if the inode is actively being * written, it has been rolled back to its zero'ed state, so we * are ensured that a zero inode is what is on the disk. For short * lived files, this change will usually result in removing all the * dependencies from the inode so that it can be freed immediately. */ static int check_inode_unwritten(inodedep) struct inodedep *inodedep; { mtx_assert(&lk, MA_OWNED); if ((inodedep->id_state & DEPCOMPLETE) != 0 || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || inodedep->id_nlinkdelta != 0) return (0); /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". */ if ((inodedep->id_state & IOSTARTED) != 0 && inodedep->id_savedino1 == NULL) return (0); inodedep->id_state |= ALLCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { FREE(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; } if (free_inodedep(inodedep) == 0) panic("check_inode_unwritten: busy inode"); return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { mtx_assert(&lk, MA_OWNED); if ((inodedep->id_state & ONWORKLIST) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); num_inodedep -= 1; return (1); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. */ static void handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; int i, nblocks, level, bsize; ufs2_daddr_t bn, blocksreleased = 0; int error, allerror = 0; ufs_lbn_t baselbns[NIADDR], tmpval; int fs_pendingblocks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; tmpval = 1; baselbns[0] = NDADDR; for (i = 1; i < NIADDR; i++) { tmpval *= NINDIR(fs); baselbns[i] = baselbns[i - 1] + tmpval; } nblocks = btodb(fs->fs_bsize); blocksreleased = 0; /* * Release all extended attribute blocks or frags. */ if (freeblks->fb_oldextsize > 0) { for (i = (NXADDR - 1); i >= 0; i--) { if ((bn = freeblks->fb_eblks[i]) == 0) continue; bsize = sblksize(fs, freeblks->fb_oldextsize, i); ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, freeblks->fb_previousinum); blocksreleased += btodb(bsize); } } /* * Release all data blocks or frags. */ if (freeblks->fb_oldsize > 0) { /* * Indirect blocks first. */ for (level = (NIADDR - 1); level >= 0; level--) { if ((bn = freeblks->fb_iblks[level]) == 0) continue; if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), level, baselbns[level], &blocksreleased)) != 0) allerror = error; ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, fs->fs_bsize, freeblks->fb_previousinum); fs_pendingblocks += nblocks; blocksreleased += nblocks; } /* * All direct blocks or frags. */ for (i = (NDADDR - 1); i >= 0; i--) { if ((bn = freeblks->fb_dblks[i]) == 0) continue; bsize = sblksize(fs, freeblks->fb_oldsize, i); ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, freeblks->fb_previousinum); fs_pendingblocks += btodb(bsize); blocksreleased += btodb(bsize); } } UFS_LOCK(ump); fs->fs_pendingblocks -= fs_pendingblocks; UFS_UNLOCK(ump); /* * If we still have not finished background cleanup, then check * to see if the block count needs to be adjusted. */ if (freeblks->fb_chkcnt != blocksreleased && (fs->fs_flags & FS_UNCLEAN) != 0 && ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) { ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ freeblks->fb_chkcnt - blocksreleased); ip->i_flag |= IN_CHANGE; vput(vp); } #ifdef INVARIANTS if (freeblks->fb_chkcnt != blocksreleased && ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) printf("handle_workitem_freeblocks: block count\n"); if (allerror) softdep_error("handle_workitem_freeblks", allerror); #endif /* INVARIANTS */ ACQUIRE_LOCK(&lk); WORKITEM_FREE(freeblks, D_FREEBLKS); FREE_LOCK(&lk); } /* * Release blocks associated with the inode ip and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int indir_trunc(freeblks, dbn, level, lbn, countp) struct freeblks *freeblks; ufs2_daddr_t dbn; int level; ufs_lbn_t lbn; ufs2_daddr_t *countp; { struct buf *bp; struct fs *fs; struct worklist *wk; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; ufs2_daddr_t nb, *bap2 = 0; ufs_lbn_t lbnadd; int i, nblocks, ufs1fmt; int error, allerror = 0; int fs_pendingblocks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); /* * Get buffer of block pointers to be freed. This routine is not * called until the zero'ed inode has been written, so it is safe * to free blocks as they are encountered. Because the inode has * been zero'ed, calls to bmap on these blocks will fail. So, we * have to use the on-disk address and the block device for the * filesystem to look them up. If the file was deleted before its * indirect blocks were all written to disk, the routine that set * us up (deallocate_dependencies) will have arranged to leave * a complete copy of the indirect block in memory for our use. * Otherwise we have to read the blocks in from the disk. */ #ifdef notyet bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0, GB_NOCREAT); #else bp = incore(&freeblks->fb_devvp->v_bufobj, dbn); #endif ACQUIRE_LOCK(&lk); if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: lost indirdep"); WORKLIST_REMOVE(wk); WORKITEM_FREE(indirdep, D_INDIRDEP); if (!LIST_EMPTY(&bp->b_dep)) panic("indir_trunc: dangling dep"); ump->um_numindirdeps -= 1; FREE_LOCK(&lk); } else { #ifdef notyet if (bp) brelse(bp); #endif FREE_LOCK(&lk); error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } } /* * Recursively free indirect blocks. */ if (ump->um_fstype == UFS1) { ufs1fmt = 1; bap1 = (ufs1_daddr_t *)bp->b_data; } else { ufs1fmt = 0; bap2 = (ufs2_daddr_t *)bp->b_data; } nblocks = btodb(fs->fs_bsize); for (i = NINDIR(fs) - 1; i >= 0; i--) { if (ufs1fmt) nb = bap1[i]; else nb = bap2[i]; if (nb == 0) continue; if (level != 0) { if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), level - 1, lbn + (i * lbnadd), countp)) != 0) allerror = error; } ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_previousinum); fs_pendingblocks += nblocks; *countp += nblocks; } UFS_LOCK(ump); fs->fs_pendingblocks -= fs_pendingblocks; UFS_UNLOCK(ump); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (allerror); } /* * Free an allocindir. * This routine must be called with splbio interrupts blocked. */ static void free_allocindir(aip, inodedep) struct allocindir *aip; struct inodedep *inodedep; { struct freefrag *freefrag; mtx_assert(&lk, MA_OWNED); if ((aip->ai_state & DEPCOMPLETE) == 0) LIST_REMOVE(aip, ai_deps); if (aip->ai_state & ONWORKLIST) WORKLIST_REMOVE(&aip->ai_list); LIST_REMOVE(aip, ai_next); if ((freefrag = aip->ai_freefrag) != NULL) { if (inodedep == NULL) add_to_worklist(&freefrag->ff_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &freefrag->ff_list); } WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. * Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs filesystem will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. */ int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ ino_t newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct allocdirect *adp; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; struct mount *mp; /* * Whiteouts have no dependencies. */ if (newinum == WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return (0); } mp = UFSTOVFS(dp->i_ump); fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); } if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); FREE_LOCK(&lk); bdwrite(newdirbp); /* * Dependency on link count increase for parent directory */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); } else { LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } } /* * Link into parent directory pagedep to await its being written. */ if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); if (isnewblk) { /* * Directories growing into indirect blocks are rare * enough and the frequency of new block allocation * in those cases even more rare, that we choose not * to bother tracking them. Rather we simply force the * new directory entry to disk. */ if (lbn >= NDADDR) { FREE_LOCK(&lk); /* * We only have a new allocation when at the * beginning of a new block, not when we are * expanding into an existing block. */ if (blkoff(fs, diroffset) == 0) return (1); return (0); } /* * We only have a new allocation when at the beginning * of a new fragment, not when we are expanding into an * existing fragment. Also, there is nothing to do if we * are already tracking this block. */ if (fragoff(fs, diroffset) != 0) { FREE_LOCK(&lk); return (0); } if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(&lk); return (0); } /* * Find our associated allocdirect and have it track us. */ if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) panic("softdep_setup_directory_add: lost inodedep"); adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); if (adp == NULL || adp->ad_lbn != lbn) panic("softdep_setup_directory_add: lost entry"); pagedep->pd_state |= NEWBLOCK; newdirblk->db_pagedep = pagedep; WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); } FREE_LOCK(&lk); return (0); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct diradd *dap; ufs_lbn_t lbn; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { if (dap->da_offset != oldoffset) continue; dap->da_offset = newoffset; if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) break; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], dap, da_pdlist); break; } if (dap == NULL) { LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { if (dap->da_offset == oldoffset) { dap->da_offset = newoffset; break; } } } done: bcopy(oldloc, newloc, entrysize); FREE_LOCK(&lk); } /* * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. */ static void free_diradd(dap) struct diradd *dap; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; mtx_assert(&lk, MA_OWNED); WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) (void) free_inodedep(inodedep); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~mkdir->md_state; WORKLIST_REMOVE(&mkdir->md_list); LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to a zeroed entry until * the new inode is committed to disk. If the COMPLETE flag is * set then we have deleted an entry that never made it to * disk. If the entry we deleted resulted from a name change, * then the old name still resides on disk. We cannot delete * its inode (returned to us in prevdirrem) until the zeroed * directory entry gets to disk. The new inode has never been * referenced on the disk, so can be deleted immediately. */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); FREE_LOCK(&lk); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; FREE_LOCK(&lk); handle_workitem_remove(dirrem, NULL); } } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static long num_dirrem; /* number of dirrem allocated */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); /* * If we are over our limit, try to improve the situation. * Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. */ ACQUIRE_LOCK(&lk); if (num_dirrem > max_softdeps / 2) (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE); num_dirrem += 1; FREE_LOCK(&lk); MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. Check for an entry on both the pd_dirraddhd * list and the pd_pendinghd list. */ LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) break; if (dap == NULL) { LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) break; if (dap == NULL) return (dirrem); } /* * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %d should be %d", ip->i_number, dap->da_newinum); /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). */ if ((dap->da_state & DIRCHG) != 0) { *prevdirremp = dap->da_previous; dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } /* * We are deleting an entry that never made it to disk. * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; free_diradd(dap); return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. */ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ ino_t newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); mp = UFSTOVFS(dp->i_ump); /* * Whiteouts do not need diradd dependencies. */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); return; } /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to the previous inode until * the new inode is committed to disk. If the COMPLETE flag is * set, then we have deleted an entry that never made it to disk. * If the entry we deleted resulted from a name change, then the old * inode reference still resides on disk. Any rollback that we do * needs to be to that old inode (returned to us in prevdirrem). If * the entry we deleted resulted from a create, then there is * no entry on the disk, so we want to roll back to zero rather * than the uncommitted inode. In either of the COMPLETE cases we * want to immediately free the unwritten and unreferenced inode. */ if ((dirrem->dm_state & COMPLETE) == 0) { dap->da_previous = dirrem; } else { if (prevdirrem != NULL) { dap->da_previous = prevdirrem; } else { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } FREE_LOCK(&lk); } /* * Called whenever the link count on an inode is changed. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; ACQUIRE_LOCK(&lk); (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); } /* * Called when the effective link count and the reference count * on an inode drops to zero. At this point there are no names * referencing the file in the filesystem and no active file * references. The space associated with the file will be freed * as soon as the necessary soft dependencies are cleared. */ void softdep_releasefile(ip) struct inode *ip; /* inode with the zero effective link count */ { struct inodedep *inodedep; struct fs *fs; int extblocks; if (ip->i_effnlink > 0) panic("softdep_releasefile: file still referenced"); /* * We may be called several times as the on-disk link count * drops to zero. We only want to account for the space once. */ if (ip->i_flag & IN_SPACECOUNTED) return; /* * We have to deactivate a snapshot otherwise copyonwrites may * add blocks and the cleanup may remove blocks after we have * tried to account for them. */ if ((ip->i_flags & SF_SNAPSHOT) != 0) ffs_snapremove(ITOV(ip)); /* * If we are tracking an nlinkdelta, we have to also remember * whether we accounted for the freed space yet. */ ACQUIRE_LOCK(&lk); if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep))) inodedep->id_state |= SPACECOUNTED; FREE_LOCK(&lk); fs = ip->i_fs; extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks; ip->i_fs->fs_pendinginodes += 1; UFS_UNLOCK(ip->i_ump); ip->i_flag |= IN_SPACECOUNTED; } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ static void handle_workitem_remove(dirrem, xp) struct dirrem *dirrem; struct vnode *xp; { struct thread *td = curthread; struct inodedep *inodedep; struct vnode *vp; struct inode *ip; ino_t oldinum; int error; if ((vp = xp) == NULL && (error = ffs_vget(dirrem->dm_list.wk_mp, dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); ACQUIRE_LOCK(&lk); if ((inodedep_lookup(dirrem->dm_list.wk_mp, dirrem->dm_oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); vput(vp); return; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Next truncate the directory to length zero. When the * truncation completes, arrange to have the reference count on * the parent decremented to account for the loss of "..". */ ip->i_nlink -= 2; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0) softdep_error("handle_workitem_remove: truncate", error); ACQUIRE_LOCK(&lk); /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); vput(vp); return; } /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can remove the file immediately. */ dirrem->dm_state = 0; oldinum = dirrem->dm_oldinum; dirrem->dm_oldinum = dirrem->dm_dirinum; if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum, 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { if (xp != NULL) add_to_worklist(&dirrem->dm_list); FREE_LOCK(&lk); vput(vp); if (xp == NULL) handle_workitem_remove(dirrem, NULL); return; } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); ip->i_flag |= IN_CHANGE; ffs_update(vp, 0); vput(vp); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct fs *fs; struct inodedep *idp; struct ufsmount *ump; int error; ump = VFSTOUFS(freefile->fx_list.wk_mp); fs = ump->um_fs; #ifdef DEBUG ACQUIRE_LOCK(&lk); error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(&lk); if (error) panic("handle_workitem_freefile: inodedep survived"); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefile, D_FREEFILE); FREE_LOCK(&lk); } /* * Helper function which unlinks marker element from work list and returns * the next element on the list. */ static __inline struct worklist * markernext(struct worklist *marker) { struct worklist *next; next = LIST_NEXT(marker, wk_list); LIST_REMOVE(marker, wk_list); return next; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. * * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ static void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk; struct worklist marker; struct indirdep *indirdep; struct inodedep *inodedep; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ ACQUIRE_LOCK(&lk); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = markernext(&marker)) { LIST_INSERT_AFTER(wk, &marker, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: inodedep = WK_INODEDEP(wk); if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) initiate_write_inodeblock_ufs1(inodedep, bp); else initiate_write_inodeblock_ufs2(inodedep, bp); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this * will be writing the real pointers, so the * dependency can be freed. */ if (LIST_EMPTY(&indirdep->ir_deplisthd)) { struct buf *bp; bp = indirdep->ir_savebp; bp->b_flags |= B_INVAL | B_NOCACHE; /* inline expand WORKLIST_REMOVE(wk); */ wk->wk_state &= ~ONWORKLIST; LIST_REMOVE(wk, wk_list); WORKITEM_FREE(indirdep, D_INDIRDEP); FREE_LOCK(&lk); brelse(bp); ACQUIRE_LOCK(&lk); continue; } /* * Replace up-to-date version with safe version. */ FREE_LOCK(&lk); MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); continue; case D_MKDIR: case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(&lk); PRELE(curproc); /* Allow swapout of kernel stack */ } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %d != new %d", "initiate_write_filepage", ep->d_ino, dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } } /* * Version of initiate_write_inodeblock that handles UFS1 dinodes. * Note that any bug fixes made to this routine must be done in the * version found below. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs1(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs1: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino1 != NULL) panic("initiate_write_inodeblock_ufs1: I/O underway"); FREE_LOCK(&lk); MALLOC(sip, struct ufs1_dinode *, sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); inodedep->id_savedino1 = sip; *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; if (TAILQ_EMPTY(&inodedep->id_inoupdt)) return; /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (adp->ad_lbn < NDADDR && dp->di_db[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn, dp->di_db[adp->ad_lbn], (intmax_t)adp->ad_newblkno); if (adp->ad_lbn >= NDADDR && dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn - NDADDR, dp->di_ib[adp->ad_lbn - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_lbn >= NDADDR) break; dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_lbn - NDADDR] = 0; } /* * Version of initiate_write_inodeblock that handles UFS2 dinodes. * Note that any bug fixes made to this routine must be done in the * version found above. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs2(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs2: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino2 != NULL) panic("initiate_write_inodeblock_ufs2: I/O underway"); FREE_LOCK(&lk); MALLOC(sip, struct ufs2_dinode *, sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); inodedep->id_savedino2 = sip; *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_extupdt)) return; /* * Set the ext data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn, (intmax_t)dp->di_extb[adp->ad_lbn], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the ext * data which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } lastadp = NULL; break; } /* * If we have zero'ed out the last allocated block of the ext * data, roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; } /* * Set the file data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (adp->ad_lbn < NDADDR && dp->di_db[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn, (intmax_t)dp->di_db[adp->ad_lbn], (intmax_t)adp->ad_newblkno); if (adp->ad_lbn >= NDADDR && dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", (intmax_t)adp->ad_lbn - NDADDR, (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_lbn >= NDADDR) break; dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_lbn - NDADDR] = 0; } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. */ static void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct worklist *owk; struct workhead reattach; struct newblk *newblk; struct allocindir *aip; struct allocdirect *adp; struct indirdep *indirdep; struct inodedep *inodedep; struct bmsafemap *bmsafemap; /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be unrolled. */ if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) return; LIST_INIT(&reattach); /* * This lock must not be released anywhere in this code segment. */ ACQUIRE_LOCK(&lk); owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); if (wk == owk) panic("duplicate worklist: %p\n", wk); owk = wk; switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); } while ((adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; LIST_REMOVE(adp, ad_deps); handle_allocdirect_partdone(adp); } while ((aip = LIST_FIRST(&bmsafemap->sm_allocindirhd))) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; LIST_REMOVE(aip, ai_deps); handle_allocindir_partdone(aip); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { inodedep->id_state |= DEPCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; } WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); adp->ad_state |= COMPLETE; handle_allocdirect_partdone(adp); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); aip->ai_state |= COMPLETE; handle_allocindir_partdone(aip); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_write_complete: indirdep gone"); bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); FREE(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = 0; indirdep->ir_state &= ~UNDONE; indirdep->ir_state |= ATTACHED; while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); } WORKLIST_INSERT(&reattach, wk); if ((bp->b_flags & B_DELWRI) == 0) stat_indir_blk_ptrs++; bdirty(bp); continue; default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(&lk); } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocdirect_partdone(adp) struct allocdirect *adp; /* the completed allocdirect */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize, delay; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (adp->ad_buf != NULL) panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt * or id_extupdt as appropriate. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; if (adp->ad_state & EXTDATA) listhead = &inodedep->id_extupdt; else listhead = &inodedep->id_inoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list. */ if (listadp == NULL) { #ifdef DEBUG if (adp->ad_state & EXTDATA) listhead = &inodedep->id_newextupdt; else listhead = &inodedep->id_newinoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* DEBUG */ return; } /* * If we have found the just finished dependency, then free * it along with anything that follows it that is complete. * If the inode still has a bitmap dependency, then it has * never been written to disk, hence the on-disk inode cannot * reference the old fragment so we can free it without delay. */ delay = (inodedep->id_state & DEPCOMPLETE); for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; free_allocdirect(listhead, adp, delay); } } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (aip->ai_buf != NULL) panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; if (indirdep->ir_state & UNDONE) { LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; LIST_REMOVE(aip, ai_next); if (aip->ai_freefrag != NULL) add_to_worklist(&aip->ai_freefrag->ff_list); WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * splbio interrupts blocked. */ static int handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { struct worklist *wk, *filefree; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; int hadchanges, fstype; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); } /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else *dp2 = *inodedep->id_savedino2; FREE(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); return (1); } inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { if (adp->ad_lbn < NDADDR) { if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", (intmax_t)adp->ad_lbn, dp1->di_db[adp->ad_lbn], (intmax_t)adp->ad_oldblkno); dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; } else { if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_lbn - NDADDR, dp1->di_ib[adp->ad_lbn - NDADDR]); dp1->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; } } else { if (adp->ad_lbn < NDADDR) { if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", (intmax_t)adp->ad_lbn, "mismatch", (intmax_t)dp2->di_db[adp->ad_lbn], (intmax_t)adp->ad_oldblkno); dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; } else { if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_lbn - NDADDR, (intmax_t) dp2->di_ib[adp->ad_lbn - NDADDR]); dp2->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; } } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", (intmax_t)adp->ad_lbn, "mismatch", (intmax_t)dp2->di_extb[adp->ad_lbn], (intmax_t)adp->ad_oldblkno); dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); if (fstype == UFS1) { if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } if (dp2->di_extsize != inodedep->id_savedextsize) { dp2->di_extsize = inodedep->id_savedextsize; hadchanges = 1; } } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (hadchanges) bdirty(bp); /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) handle_allocdirect_partdone(adp); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples * before the old ones have been deleted. */ filefree = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding filefree to the worklist until * all other additions have been made to ensure * that it will be done after all the old blocks * have been freed. */ if (filefree != NULL) panic("handle_written_inodeblock: filefree"); filefree = wk; continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEBLKS: wk->wk_state |= COMPLETE; if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) continue; /* -- fall through -- */ case D_FREEFRAG: case D_DIRREM: add_to_worklist(wk); continue; case D_NEWDIRBLK: free_newdirblk(WK_NEWDIRBLK(wk)); continue; default: panic("handle_written_inodeblock: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } if (filefree != NULL) { if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep"); add_to_worklist(filefree); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && TAILQ_FIRST(&inodedep->id_extupdt) == 0)) return (0); return (hadchanges); } /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { struct pagedep *pagedep; dap->da_state |= COMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { struct diradd *dap; struct pagedep *pagedep; if (mkdir->md_state != type) panic("handle_written_mkdir: bad type"); dap = mkdir->md_diradd; dap->da_state &= ~type; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) dap->da_state |= DEPCOMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level * with further splbio interrupts blocked. */ static int handle_written_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } /* * Free any directory additions that have been committed. * If it is a newly allocated block, we have to wait until * the on-disk directory inode claims the new block. */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap); /* * Uncommitted directory entries must be restored. */ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (chgs) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); return (1); } /* * If we are not waiting for a new directory block to be * claimed by its inode, then the pagedep will be freed. * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. */ if ((pagedep->pd_state & NEWBLOCK) == 0) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } return (0); } /* * Writing back in-core inode structures. * * The filesystem only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back. */ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(&lk); if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return; } ip->i_effnlink -= inodedep->id_nlinkdelta; if (inodedep->id_state & SPACECOUNTED) ip->i_flag |= IN_SPACECOUNTED; FREE_LOCK(&lk); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes. If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct worklist *wk; struct mount *mp; struct buf *ibp; int error; /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. */ mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) panic("softdep_update_inodeblock: bad link count"); return; } if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them have been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if (waitfor == 0) { FREE_LOCK(&lk); return; } retry: if ((inodedep->id_state & DEPCOMPLETE) != 0) { FREE_LOCK(&lk); return; } ibp = inodedep->id_buf; ibp = getdirtybuf(ibp, &lk, MNT_WAIT); if (ibp == NULL) { /* * If ibp came back as NULL, the dependency could have been * freed while we slept. Look it up again, and check to see * that it has completed. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) goto retry; FREE_LOCK(&lk); return; } FREE_LOCK(&lk); if ((error = bwrite(ibp)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); } /* * Merge the a new inode dependency list (such as id_newinoupdt) into an * old inode dependency list (such as id_inoupdt). This routine must be * called with splbio interrupts blocked. */ static void merge_inode_lists(newlisthead, oldlisthead) struct allocdirectlst *newlisthead; struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_lbn < newadp->ad_lbn) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_lbn == newadp->ad_lbn) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(newlisthead); } while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk. */ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct pagedep *pagedep; struct worklist *wk; struct diradd *dap; struct mount *mp; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct thread *td = curthread; int error, flushparent, pagedep_new_block; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); fs = ip->i_fs; mp = vp->v_mount; ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return (0); } if (!LIST_EMPTY(&inodedep->id_inowait) || !LIST_EMPTY(&inodedep->id_bufwait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) panic("softdep_fsync: pending ops"); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * Flush our parent if this directory entry has a MKDIR_PARENT * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); if ((dap->da_state & MKDIR_PARENT) || (pagedep->pd_state & NEWBLOCK)) flushparent = 1; else flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (vp->v_iflag & VI_DOOMED) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we first try to get the lock. If * that fails, we must unlock ourselves before requesting * the lock on our parent. See the comment in ufs_lookup * for details on possible races. */ FREE_LOCK(&lk); if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) { VOP_UNLOCK(vp, 0, td); error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) return (error); } /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by * doing a ffs_update. Pagedeps contained in indirect blocks * may require a complete sync'ing of the directory. So, we * try the cheap and fast ffs_update first, and if that fails, * then we do the slower ffs_syncvnode of the directory. */ if (flushparent) { int locked; if ((error = ffs_update(pvp, 1)) != 0) { vput(pvp); return (error); } ACQUIRE_LOCK(&lk); locked = 1; if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; pagedep_new_block = pagedep->pd_state & NEWBLOCK; FREE_LOCK(&lk); locked = 0; if (pagedep_new_block && (error = ffs_syncvnode(pvp, MNT_WAIT))) { vput(pvp); return (error); } } } if (locked) FREE_LOCK(&lk); } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, &bp); if (error == 0) error = bwrite(bp); else brelse(bp); vput(pvp); if (error != 0) return (error); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) break; } FREE_LOCK(&lk); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; if (!vn_isdisk(vp, NULL)) panic("softdep_fsync_mountdev: vnode not a disk"); restart: ACQUIRE_LOCK(&lk); VI_LOCK(vp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. */ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP || (bp->b_vflags & BV_BKGRDINPROG)) { BUF_UNLOCK(bp); continue; } VI_UNLOCK(vp); FREE_LOCK(&lk); bremfree(bp); (void) bawrite(bp); goto restart; } FREE_LOCK(&lk); drain_output(vp); VI_UNLOCK(vp); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed by pushing the dirty blocks * associated with the file. If any I/O errors occur, they are returned. */ int softdep_sync_metadata(struct vnode *vp) { struct pagedep *pagedep; struct allocdirect *adp; struct allocindir *aip; struct buf *bp, *nbp; struct worklist *wk; int i, error, waitfor; if (!DOINGSOFTDEP(vp)) return (0); /* * Ensure that any direct block dependencies have been cleared. */ ACQUIRE_LOCK(&lk); if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) { FREE_LOCK(&lk); return (error); } FREE_LOCK(&lk); /* * For most files, the only metadata dependencies are the * cylinder group maps that allocate their inode or blocks. * The block allocation dependencies can be found by traversing * the dependency lists for any buffers that remain on their * dirty buffer list. The inode allocation dependency will * be resolved when the inode is updated with MNT_WAIT. * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. */ waitfor = MNT_NOWAIT; top: /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. */ VI_LOCK(vp); drain_output(vp); while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) { bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT); if (bp) break; } VI_UNLOCK(vp); if (bp == NULL) return (0); loop: /* While syncing snapshots, we must allow recursive lookups */ bp->b_lock.lk_flags |= LK_CANRECURSE; ACQUIRE_LOCK(&lk); /* * As we hold the buffer locked, none of its dependencies * will disappear. */ LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); if (adp->ad_state & DEPCOMPLETE) continue; nbp = adp->ad_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; case D_INDIRDEP: restart: LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; nbp = getdirtybuf(nbp, &lk, MNT_WAIT); if (nbp == NULL) goto restart; FREE_LOCK(&lk); if ((error = bwrite(nbp)) != 0) { goto loop_end; } ACQUIRE_LOCK(&lk); goto restart; } continue; case D_INODEDEP: if ((error = flush_inodedep_deps(wk->wk_mp, WK_INODEDEP(wk)->id_ino)) != 0) { FREE_LOCK(&lk); break; } continue; case D_PAGEDEP: /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, wk->wk_mp, &pagedep->pd_diraddhd[i]))) { FREE_LOCK(&lk); goto loop_end; } } continue; case D_MKDIR: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_MKDIR(wk)->md_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; case D_BMSAFEMAP: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_BMSAFEMAP(wk)->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } loop_end: /* We reach here only in error and unlocked */ if (error == 0) panic("softdep_sync_metadata: zero error"); bp->b_lock.lk_flags &= ~LK_CANRECURSE; bawrite(bp); return (error); } FREE_LOCK(&lk); VI_LOCK(vp); while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) { nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT); if (nbp) break; } VI_UNLOCK(vp); bp->b_lock.lk_flags &= ~LK_CANRECURSE; bawrite(bp); if (nbp != NULL) { bp = nbp; goto loop; } /* * The brief unlock is to allow any pent up dependency * processing to be done. Then proceed with the second pass. */ if (waitfor == MNT_NOWAIT) { waitfor = MNT_WAIT; goto top; } /* * If we have managed to get rid of all the dirty buffers, * then we are done. For certain directories and block * devices, we may need to do further work. * * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. */ VI_LOCK(vp); drain_output(vp); VI_UNLOCK(vp); return (0); } /* * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ static int flush_inodedep_deps(mp, ino) struct mount *mp; ino_t ino; { struct inodedep *inodedep; int error, waitfor; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ for (error = 0, waitfor = MNT_NOWAIT; ; ) { if (error) return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If pass2, we are done, otherwise do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Flush an inode dependency list. * Called with splbio blocked. */ static int flush_deplist(listhead, waitfor, errorp) struct allocdirectlst *listhead; int waitfor; int *errorp; { struct allocdirect *adp; struct buf *bp; mtx_assert(&lk, MA_OWNED); TAILQ_FOREACH(adp, listhead, ad_next) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; bp = getdirtybuf(bp, &lk, waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) continue; return (1); } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); } else if ((*errorp = bwrite(bp)) != 0) { ACQUIRE_LOCK(&lk); return (1); } ACQUIRE_LOCK(&lk); return (1); } return (0); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct inodedep *inodedep; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int error = 0; struct buf *bp; ino_t inum; struct worklist *wk; ump = VFSTOUFS(mp); while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); if ((error = ffs_update(pvp, 1)) != 0) break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_PARENT) panic("flush_pagedep_deps: MKDIR_PARENT"); } /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. We do not want or need * the full semantics of a synchronous ffs_syncvnode as * that may end up here again, once for each directory * level in the filesystem. Instead, we push the blocks * and wait for them to clear. We have to fsync twice * because the first call may choose to defer blocks * that still have dependencies, but deferral will * happen at most once. */ inum = dap->da_newinum; if (dap->da_state & MKDIR_BODY) { FREE_LOCK(&lk); if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp))) break; if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) || (error=ffs_syncvnode(vp, MNT_NOWAIT))) { vput(vp); break; } VI_LOCK(vp); drain_output(vp); /* * If first block is still dirty with a D_MKDIR * dependency then it needs to be written now. */ for (;;) { error = 0; bp = gbincore(&vp->v_bufobj, 0); if (bp == NULL) break; /* First block not present */ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)); VI_LOCK(vp); if (error == ENOLCK) continue; /* Slept, retry */ if (error != 0) break; /* Failed */ if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); break; /* Buffer not dirty */ } for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = LIST_NEXT(wk, wk_list)) if (wk->wk_type == D_MKDIR) break; if (wk == NULL) BUF_UNLOCK(bp); /* Dependency gone */ else { /* * D_MKDIR dependency remains, * must write buffer to stable * storage. */ VI_UNLOCK(vp); bremfree(bp); error = bwrite(bp); VI_LOCK(vp); } break; } VI_UNLOCK(vp); vput(vp); if (error != 0) break; /* Flushing of first block failed */ ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_BODY) panic("flush_pagedep_deps: MKDIR_BODY"); } /* * Flush the inode on which the directory entry depends. * Having accounted for MKDIR_PARENT and MKDIR_BODY above, * the only remaining dependency is that the updated inode * count must get pushed to disk. The inode has already * been pushed into its inode buffer (via VOP_UPDATE) at * the time of the reference count change. So we need only * locate that buffer, ensure that there will be no rollback * caused by a bitmap dependency, then write the inode buffer. */ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* * If the inode still has bitmap dependencies, * push them to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { bp = inodedep->id_buf; bp = getdirtybuf(bp, &lk, MNT_WAIT); if (bp == NULL) goto retry; FREE_LOCK(&lk); if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(&lk); if (dap != LIST_FIRST(diraddhdp)) continue; } /* * If the inode is still sitting in a buffer waiting * to be written, push it to disk. */ FREE_LOCK(&lk); if ((error = bread(ump->um_devvp, fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); break; } if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(&lk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush failed"); } if (error) ACQUIRE_LOCK(&lk); return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. First attempt to slow things down * using the techniques below. If that fails, this routine requests * the offending operations to fall back to running synchronously * until the memory load returns to a reasonable level. */ int softdep_slowdown(vp) struct vnode *vp; { int max_softdeps_hard; ACQUIRE_LOCK(&lk); max_softdeps_hard = max_softdeps * 11 / 10; if (num_dirrem < max_softdeps_hard / 2 && num_inodedep < max_softdeps_hard && VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) { FREE_LOCK(&lk); return (0); } if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps) softdep_speedup(); stat_sync_limit_hit += 1; FREE_LOCK(&lk); return (1); } /* * Called by the allocation routines when they are about to fail * in the hope that we can free up some disk space. * * First check to see if the work list has anything on it. If it has, * clean up entries until we successfully free some space. Because this * process holds inodes locked, we cannot handle any remove requests * that might block on a locked inode as that could lead to deadlock. * If the worklist yields no free space, encourage the syncer daemon * to help us. In no event will we try for longer than tickdelay seconds. */ int softdep_request_cleanup(fs, vp) struct fs *fs; struct vnode *vp; { struct ufsmount *ump; long starttime; ufs2_daddr_t needed; int error; ump = VTOI(vp)->i_ump; mtx_assert(UFS_MTX(ump), MA_OWNED); needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize; starttime = time_second + tickdelay; /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to update the vnode * as we may recurse into the copy-on-write routine. */ if (!(curthread->td_pflags & TDP_COWINPROGRESS)) { UFS_UNLOCK(ump); error = ffs_update(vp, 1); UFS_LOCK(ump); if (error != 0) return (0); } while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) { if (time_second > starttime) return (0); UFS_UNLOCK(ump); ACQUIRE_LOCK(&lk); if (ump->softdep_on_worklist > 0 && process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) { stat_worklist_push += 1; FREE_LOCK(&lk); UFS_LOCK(ump); continue; } request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT); FREE_LOCK(&lk); UFS_LOCK(ump); } return (1); } /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. */ extern struct thread *syncertd; static int request_cleanup(mp, resource) struct mount *mp; int resource; { struct thread *td = curthread; struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); /* * We never hold up the filesystem syncer or buf daemon. */ if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) return (0); ump = VFSTOUFS(mp); /* * First check to see if the work list has gotten backlogged. * If it has, co-opt this process to help clean up two entries. * Because this process may hold inodes locked, we cannot * handle any remove requests that might block on a locked * inode as that could lead to deadlock. We set TDP_SOFTDEP * to avoid recursively processing the worklist. */ if (ump->softdep_on_worklist > max_softdeps / 10) { td->td_pflags |= TDP_SOFTDEP; process_worklist_item(mp, LK_NOWAIT); process_worklist_item(mp, LK_NOWAIT); td->td_pflags &= ~TDP_SOFTDEP; stat_worklist_push += 2; return(1); } /* * Next, we attempt to speed up the syncer process. If that * is successful, then we allow the process to continue. */ if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT) return(0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. */ switch (resource) { case FLUSH_INODES: stat_ino_limit_push += 1; req_clear_inodedeps += 1; stat_countp = &stat_ino_limit_hit; break; case FLUSH_REMOVE: case FLUSH_REMOVE_WAIT: stat_blk_limit_push += 1; req_clear_remove += 1; stat_countp = &stat_blk_limit_hit; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ proc_waiting += 1; if (handle.callout == NULL) handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2); msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. */ static void pause_timer(arg) void *arg; { ACQUIRE_LOCK(&lk); *stat_countp += 1; wakeup_one(&proc_waiting); if (proc_waiting > 0) handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2); else handle.callout = NULL; FREE_LOCK(&lk); } /* * Flush out a directory with at least one removal dependency in an effort to * reduce the number of dirrem, freefile, and freeblks dependency structures. */ static void clear_remove(td) struct thread *td; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; static int next = 0; struct mount *mp; struct vnode *vp; int error, cnt; ino_t ino; mtx_assert(&lk, MA_OWNED); for (cnt = 0; cnt < pagedep_hash; cnt++) { pagedephd = &pagedep_hashtbl[next++]; if (next >= pagedep_hash) next = 0; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (LIST_EMPTY(&pagedep->pd_dirremhd)) continue; mp = pagedep->pd_list.wk_mp; ino = pagedep->pd_ino; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(&lk); if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) { softdep_error("clear_remove: vget", error); vn_finished_write(mp); ACQUIRE_LOCK(&lk); return; } if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) softdep_error("clear_remove: fsync", error); VI_LOCK(vp); drain_output(vp); VI_UNLOCK(vp); vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(&lk); return; } } } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(td) struct thread *td; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; static int next = 0; struct mount *mp; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; mtx_assert(&lk, MA_OWNED); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt < inodedep_hash; cnt++) { inodedephd = &inodedep_hashtbl[next++]; if (next >= inodedep_hash) next = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } if (inodedep == NULL) return; fs = inodedep->id_fs; mp = inodedep->id_list.wk_mp; /* * Find the last inode in the block with dependencies. */ firstino = inodedep->id_ino & ~(INOPB(fs) - 1); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. */ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) continue; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(&lk); if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) { softdep_error("clear_inodedeps: vget", error); vn_finished_write(mp); ACQUIRE_LOCK(&lk); return; } if (ino == lastino) { if ((error = ffs_syncvnode(vp, MNT_WAIT))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) softdep_error("clear_inodedeps: fsync2", error); VI_LOCK(vp); drain_output(vp); VI_UNLOCK(vp); } vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(&lk); } } /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount * is set, return number of dependencies, otherwise just yes or no. */ static int softdep_count_dependencies(bp, wantcount) struct buf *bp; int wantcount; { struct worklist *wk; struct inodedep *inodedep; struct indirdep *indirdep; struct allocindir *aip; struct pagedep *pagedep; struct diradd *dap; int i, retval; retval = 0; ACQUIRE_LOCK(&lk); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_INODEDEP: inodedep = WK_INODEDEP(wk); if ((inodedep->id_state & DEPCOMPLETE) == 0) { /* bitmap allocation dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_extupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { /* indirect block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { /* directory entry dependency */ retval += 1; if (!wantcount) goto out; } } continue; case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: case D_MKDIR: /* never a dependency on these blocks */ continue; default: panic("softdep_check_for_rollback: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out: FREE_LOCK(&lk); return retval; } /* * Acquire exclusive access to a buffer. * Must be called with a locked mtx parameter. * Return acquired buffer or NULL on failure. */ static struct buf * getdirtybuf(bp, mtx, waitfor) struct buf *bp; struct mtx *mtx; int waitfor; { int error; mtx_assert(mtx, MA_OWNED); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { if (waitfor != MNT_WAIT) return (NULL); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx); /* * Even if we sucessfully acquire bp here, we have dropped * mtx, which may violates our guarantee. */ if (error == 0) BUF_UNLOCK(bp); else if (error != ENOLCK) panic("getdirtybuf: inconsistent lock: %d", error); mtx_lock(mtx); return (NULL); } if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { if (mtx == &lk && waitfor == MNT_WAIT) { mtx_unlock(mtx); BO_LOCK(bp->b_bufobj); BUF_UNLOCK(bp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO | PDROP, "getbuf", 0); } else BO_UNLOCK(bp->b_bufobj); mtx_lock(mtx); return (NULL); } BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); /* * The mtx argument must be bp->b_vp's mutex in * this case. */ #ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf"); #endif bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0); return (NULL); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (NULL); } bremfree(bp); return (bp); } /* * Check if it is safe to suspend the file system now. On entry, * the vnode interlock for devvp should be held. Return 0 with * the mount interlock held if the file system can be suspended now, * otherwise return EAGAIN with the mount interlock held. */ int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_deps, int softdep_accdeps, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; struct ufsmount *ump; int error; ASSERT_VI_LOCKED(devvp, "softdep_check_suspend"); ump = VFSTOUFS(mp); bo = &devvp->v_bufobj; for (;;) { if (!TRY_ACQUIRE_LOCK(&lk)) { VI_UNLOCK(devvp); ACQUIRE_LOCK(&lk); FREE_LOCK(&lk); VI_LOCK(devvp); continue; } if (!MNT_ITRYLOCK(mp)) { FREE_LOCK(&lk); VI_UNLOCK(devvp); MNT_ILOCK(mp); MNT_IUNLOCK(mp); VI_LOCK(devvp); continue; } if (mp->mnt_secondary_writes != 0) { FREE_LOCK(&lk); VI_UNLOCK(devvp); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); VI_LOCK(devvp); continue; } break; } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Softdep activity occurred after start of vnode sync loop * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || softdep_deps != 0 || ump->softdep_deps != 0 || softdep_accdeps != ump->softdep_accdeps || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; FREE_LOCK(&lk); VI_UNLOCK(devvp); return (error); } /* * Get the number of dependency structures for the file system, both * the current number and the total number allocated. These will * later be used to detect that softdep processing has occurred. */ void softdep_get_depcounts(struct mount *mp, int *softdep_depsp, int *softdep_accdepsp) { struct ufsmount *ump; ump = VFSTOUFS(mp); ACQUIRE_LOCK(&lk); *softdep_depsp = ump->softdep_deps; *softdep_accdepsp = ump->softdep_accdeps; FREE_LOCK(&lk); } /* * Wait for pending output on a vnode to complete. * Must be called with vnode lock and interlock locked. * * XXX: Should just be a call to bufobj_wwait(). */ static void drain_output(vp) struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "drain_output"); ASSERT_VI_LOCKED(vp, "drain_output"); while (vp->v_bufobj.bo_numoutput) { vp->v_bufobj.bo_flag |= BO_WWAIT; msleep((caddr_t)&vp->v_bufobj.bo_numoutput, VI_MTX(vp), PRIBIO + 1, "drainvp", 0); } } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ static void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_ioflags & BIO_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ static void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } #endif /* SOFTUPDATES */ Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 175201) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 175202) @@ -1,1864 +1,1864 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_reload(struct mount *, struct thread *); static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr", "clusterw", "exec", "export", "force", "from", "multilabel", "snapshot", "suid", "suiddir", "symfollow", "sync", "union", NULL }; static int ffs_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; struct ufsmount *ump = 0; struct fs *fs; int error, flags; u_int mntorflags, mntandnotflags; mode_t accessmode; struct nameidata ndp; char *fspec; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; mntandnotflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0) mntorflags |= MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0) mntorflags |= MNT_FORCE; if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0) mntorflags |= MNT_MULTILABEL; if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0) mntandnotflags |= MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0) mntorflags |= MNT_NOATIME; if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0) mntorflags |= MNT_NOCLUSTERR; if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0) mntorflags |= MNT_NOCLUSTERW; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) mntorflags |= MNT_SNAPSHOT; MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); /* * Flush any dirty data. */ if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) { vn_finished_write(mp); return (error); } /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vn_finished_write(mp); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: %s: blocks %jd files %d\n", fs->fs_fsmnt, "update error", (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vn_finished_write(mp); return (error); } vn_finished_write(mp); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not %s\n", fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } DROP_GIANT(); g_topology_lock(); /* * If we're the root device, we may not have an E count * yet, get it now. */ if (ump->um_cp->ace == 0) error = g_access(ump->um_cp, 0, 1, 1); else error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp)) != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; /* The file system is still read-only. */ newfs->fs_ronly = 1; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: reload pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; dev = devvp->v_rdev; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); /* * If we are a root mount, drop the E flag so fsck can do its magic. * We will pick it up again when we remount R/W. */ if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS)) error = g_access(cp, 0, 0, -1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; devvp->v_bufobj.bo_private = cp; devvp->v_bufobj.bo_ops = &ffs_ops; bp = NULL; ump = NULL; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: mount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ size = 1024; mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &size, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf( "WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf( "WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf( "WARNING: %s: multilabel flag on fs but no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf( "WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem stat information in mount struct. */ #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; } return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("ffs_unmount: ufs_extattr_stop returned %d\n", error); } else { ufs_extattr_uepm_destroy(&ump->um_extattr); } #endif if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, td)) != 0) return (error); } else { if ((error = ffs_flushfiles(mp, flags, td)) != 0) return (error); } fs = ump->um_fs; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: unmount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error) { fs->fs_clean = 0; return (error); } } DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { quotaoff(td, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Flush all the files. */ if ((error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ - vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0, td); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ffs_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { struct vnode *mvp, *vp, *devvp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ wait = 0; suspend = 0; suspended = 0; lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; MNT_ILOCK(mp); loop: /* Grab snapshot of secondary write counts */ secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; /* Grab snapshot of softdep dependency counts */ MNT_IUNLOCK(mp); softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { /* * Depend on the mntvnode_slock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor)) != 0) allerror = error; vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) { MNT_ILOCK(mp); goto loop; } } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; VI_LOCK(devvp); bo = &devvp->v_bufobj; if (waitfor != MNT_LAZY && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK); if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(devvp, 0, td); if (allerror == 0 && waitfor == MNT_WAIT) { MNT_ILOCK(mp); goto loop; } } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) goto loop; /* More work needed */ mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else VI_UNLOCK(devvp); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; struct thread *td; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); dev = ump->um_dev; fs = ump->um_fs; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ffs_sync() if a sync happens to fire right then, * which will cause a panic because ffs_sync() blindly * dereferences vp->v_data (as well it should). */ ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if (fs->fs_magic == FS_UFS1_MAGIC) error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp); else error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive and shared locking. */ vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif td = curthread; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td); error = insmntque(vp, mp); if (error != 0) { uma_zfree(uma_inode, ip); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (ip->i_ump->um_fstype == UFS1) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if (ip->i_ump->um_fstype == UFS1) error = ufs_vinit(mp, &ffs_fifoops1, &vp); else error = ufs_vinit(mp, &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = arc4random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. */ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din1->di_ouid; /* XXX */ ip->i_gid = ip->i_din1->di_ogid; /* XXX */ } /* XXX */ #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_fhtovp(mp, ufhp, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(mp, waitfor, suspended) struct ufsmount *mp; int waitfor; int suspended; { struct fs *fs = mp->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* Grab an extra reference to be dropped by the bufdone() below. */ bufobj_wrefl(bufobj); BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { int oldflags, s; struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize); /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); BO_UNLOCK(bp->b_bufobj); newbp->b_bufobj = &bp->b_vp->v_bufobj; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES /* move over the dependencies */ if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, newbp); #endif /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. */ bqrelse(bp); bp = newbp; } /* Let the normal bufwrite do the rest for us */ return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; struct buf *tbp; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); bp->b_flags &= ~B_VALIDSUSPWRT; if ((vp->v_vflag & VV_COPYONWRITE) && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif } g_vfs_strategy(bo, bp); } Index: head/sys/ufs/ufs/ufs_extattr.c =================================================================== --- head/sys/ufs/ufs/ufs_extattr.c (revision 175201) +++ head/sys/ufs/ufs/ufs_extattr.c (revision 175202) @@ -1,1277 +1,1273 @@ /*- * Copyright (c) 1999-2002 Robert N. M. Watson * Copyright (c) 2002-2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Support for filesystem extended attribute: UFS-specific support functions. */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_EXTATTR static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute"); static int ufs_extattr_sync = 0; SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync, 0, ""); static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname); static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td); static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td); static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td); static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td); /* * Per-FS attribute lock protecting attribute operations. * XXX Right now there is a lot of lock contention due to having a single * lock per-FS; really, this should be far more fine-grained. */ static void ufs_extattr_uepm_lock(struct ufsmount *ump, struct thread *td) { /* Ideally, LK_CANRECURSE would not be used, here. */ lockmgr(&ump->um_extattr.uepm_lock, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE, 0, td); } static void ufs_extattr_uepm_unlock(struct ufsmount *ump, struct thread *td) { lockmgr(&ump->um_extattr.uepm_lock, LK_RELEASE, 0, td); } /*- * Determine whether the name passed is a valid name for an actual * attribute. * * Invalid currently consists of: * NULL pointer for attrname * zero-length attrname (used to retrieve application attribute list) */ static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) { if (attrname == NULL) return (0); if (strlen(attrname) == 0) return (0); return (1); } /* * Locate an attribute given a name and mountpoint. * Must be holding uepm lock for the mount point. */ static struct ufs_extattr_list_entry * ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, const char *attrname) { struct ufs_extattr_list_entry *search_attribute; for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); search_attribute != NULL; search_attribute = LIST_NEXT(search_attribute, uele_entries)) { if (!(strncmp(attrname, search_attribute->uele_attrname, UFS_EXTATTR_MAXEXTATTRNAME)) && (attrnamespace == search_attribute->uele_attrnamespace)) { return (search_attribute); } } return (0); } /* * Initialize per-FS structures supporting extended attributes. Do not * start extended attributes yet. */ void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) { uepm->uepm_flags = 0; LIST_INIT(&uepm->uepm_list); /* XXX is PVFS right, here? */ lockinit(&uepm->uepm_lock, PVFS, "extattr", 0, 0); uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; } /* * Destroy per-FS structures supporting extended attributes. Assumes * that EAs have already been stopped, and will panic if not. */ void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) { if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) panic("ufs_extattr_uepm_destroy: not initialized"); if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) panic("ufs_extattr_uepm_destroy: called while still started"); /* * It's not clear that either order for the next two lines is * ideal, and it should never be a problem if this is only called * during unmount, and with vfs_busy(). */ uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; lockdestroy(&uepm->uepm_lock); } /* * Start extended attribute support on an FS. */ int ufs_extattr_start(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error = 0; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump, td); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) { error = EOPNOTSUPP; goto unlock; } if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) { error = EBUSY; goto unlock; } ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; ump->um_extattr.uepm_ucred = crhold(td->td_ucred); unlock: ufs_extattr_uepm_unlock(ump, td); return (error); } #ifdef UFS_EXTATTR_AUTOSTART /* * Helper routine: given a locked parent directory and filename, return * the locked vnode of the inode associated with the name. Will not * follow symlinks, may return any type of vnode. Lock on parent will * be released even in the event of a failure. In the event that the * target is the parent (i.e., "."), there will be two references and * one lock, requiring the caller to possibly special-case. */ #define UE_GETDIR_LOCKPARENT 1 #define UE_GETDIR_LOCKPARENT_DONT 2 static int ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname, struct vnode **vp, struct thread *td) { struct vop_cachedlookup_args vargs; struct componentname cnp; struct vnode *target_vp; int error; bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; if (lockparent == UE_GETDIR_LOCKPARENT) cnp.cn_flags |= LOCKPARENT; cnp.cn_lkflags = LK_EXCLUSIVE; cnp.cn_thread = td; cnp.cn_cred = td->td_ucred; cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); cnp.cn_nameptr = cnp.cn_pnbuf; error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN, (size_t *) &cnp.cn_namelen); if (error) { if (lockparent == UE_GETDIR_LOCKPARENT_DONT) { VOP_UNLOCK(start_dvp, 0, td); } uma_zfree(namei_zone, cnp.cn_pnbuf); printf("ufs_extattr_lookup: copystr failed\n"); return (error); } cnp.cn_namelen--; /* trim nul termination */ vargs.a_gen.a_desc = NULL; vargs.a_dvp = start_dvp; vargs.a_vpp = &target_vp; vargs.a_cnp = &cnp; error = ufs_lookup(&vargs); uma_zfree(namei_zone, cnp.cn_pnbuf); if (error) { /* * Error condition, may have to release the lock on the parent * if ufs_lookup() didn't. */ if (lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0, td); /* * Check that ufs_lookup() didn't release the lock when we * didn't want it to. */ if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); return (error); } /* if (target_vp == start_dvp) panic("ufs_extattr_lookup: target_vp == start_dvp"); */ if (target_vp != start_dvp && lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0, td); if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); /* printf("ufs_extattr_lookup: success\n"); */ *vp = target_vp; return (0); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Enable an EA using the passed filesystem, backing vnode, attribute name, * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp * to be locked when passed in. The vnode will be returned unlocked, * regardless of success/failure of the function. As a result, the caller * will always need to vrele(), but not vput(). */ static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { int error; error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL); if (error) { printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " "with %d\n", error); VOP_UNLOCK(vp, 0, td); return (error); } vp->v_writecount++; vref(vp); VOP_UNLOCK(vp, 0, td); error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td); if (error != 0) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (error); } #ifdef UFS_EXTATTR_AUTOSTART /* * Given a locked directory vnode, iterate over the names in the directory * and use ufs_extattr_lookup() to retrieve locked vnodes of potential * attribute files. Then invoke ufs_extattr_enable_with_open() on each * to attempt to start the attribute. Leaves the directory locked on * exit. */ static int ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, int attrnamespace, struct thread *td) { struct vop_readdir_args vargs; struct dirent *dp, *edp; struct vnode *attr_vp; struct uio auio; struct iovec aiov; char *dirbuf; int error, eofflag = 0; if (dvp->v_type != VDIR) return (ENOTDIR); MALLOC(dirbuf, char *, DIRBLKSIZ, M_TEMP, M_WAITOK); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; vargs.a_gen.a_desc = NULL; vargs.a_vp = dvp; vargs.a_uio = &auio; vargs.a_cred = td->td_ucred; vargs.a_eofflag = &eofflag; vargs.a_ncookies = NULL; vargs.a_cookies = NULL; while (!eofflag) { auio.uio_resid = DIRBLKSIZ; aiov.iov_base = dirbuf; aiov.iov_len = DIRBLKSIZ; error = ufs_readdir(&vargs); if (error) { printf("ufs_extattr_iterate_directory: ufs_readdir " "%d\n", error); return (error); } /* * XXXRW: While in UFS, we always get DIRBLKSIZ returns from * the directory code on success, on other file systems this * may not be the case. For portability, we should check the * read length on return from ufs_readdir(). */ edp = (struct dirent *)&dirbuf[DIRBLKSIZ]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { #if (BYTE_ORDER == LITTLE_ENDIAN) dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else dp->d_type = 0; #endif if (dp->d_reclen == 0) break; error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, dp->d_name, &attr_vp, td); if (error) { printf("ufs_extattr_iterate_directory: lookup " "%s %d\n", dp->d_name, error); } else if (attr_vp == dvp) { vrele(attr_vp); } else if (attr_vp->v_type != VREG) { vput(attr_vp); } else { error = ufs_extattr_enable_with_open(ump, attr_vp, attrnamespace, dp->d_name, td); vrele(attr_vp); if (error) { printf("ufs_extattr_iterate_directory: " "enable %s %d\n", dp->d_name, error); } else if (bootverbose) { printf("UFS autostarted EA %s\n", dp->d_name); } } dp = (struct dirent *) ((char *)dp + dp->d_reclen); if (dp >= edp) break; } } FREE(dirbuf, M_TEMP); return (0); } /* * Auto-start of extended attributes, to be executed (optionally) at * mount-time. */ int ufs_extattr_autostart(struct mount *mp, struct thread *td) { struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; int error; /* * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? * If so, automatically start EA's. */ error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp, td); if (error) { printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", error); return (error); } error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT, UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td); if (error) { /* rvp ref'd but now unlocked */ vrele(rvp); return (error); } if (rvp == attr_dvp) { /* Should never happen. */ vput(rvp); vrele(attr_dvp); return (EINVAL); } vrele(rvp); if (attr_dvp->v_type != VDIR) { printf("ufs_extattr_autostart: %s != VDIR\n", UFS_EXTATTR_FSROOTSUBDIR); goto return_vput_attr_dvp; } error = ufs_extattr_start(mp, td); if (error) { printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", error); goto return_vput_attr_dvp; } /* * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, * and start with appropriate type. Failures in either don't * result in an over-all failure. attr_dvp is left locked to * be cleaned up on exit. */ error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_system_dvp); } error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_user_dvp, EXTATTR_NAMESPACE_USER, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_user_dvp); } /* Mask startup failures in sub-directories. */ error = 0; return_vput_attr_dvp: vput(attr_dvp); return (error); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Stop extended attribute support on an FS. */ int ufs_extattr_stop(struct mount *mp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct ufsmount *ump = VFSTOUFS(mp); int error = 0; ufs_extattr_uepm_lock(ump, td); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto unlock; } while ((uele = LIST_FIRST(&ump->um_extattr.uepm_list)) != NULL) { ufs_extattr_disable(ump, uele->uele_attrnamespace, uele->uele_attrname, td); } ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; crfree(ump->um_extattr.uepm_ucred); ump->um_extattr.uepm_ucred = NULL; unlock: ufs_extattr_uepm_unlock(ump, td); return (error); } /* * Enable a named attribute on the specified filesystem; provide an * unlocked backing vnode to hold the attribute data. */ static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct iovec aiov; struct uio auio; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); if (backing_vnode->v_type != VREG) return (EINVAL); MALLOC(attribute, struct ufs_extattr_list_entry *, sizeof(struct ufs_extattr_list_entry), M_UFS_EXTATTR, M_WAITOK); if (attribute == NULL) return (ENOMEM); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto free_exit; } if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { error = EEXIST; goto free_exit; } strncpy(attribute->uele_attrname, attrname, UFS_EXTATTR_MAXEXTATTRNAME); attribute->uele_attrnamespace = attrnamespace; bzero(&attribute->uele_fileheader, sizeof(struct ufs_extattr_fileheader)); attribute->uele_backing_vnode = backing_vnode; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (caddr_t) &attribute->uele_fileheader; aiov.iov_len = sizeof(struct ufs_extattr_fileheader); auio.uio_resid = sizeof(struct ufs_extattr_fileheader); auio.uio_offset = (off_t) 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; - vn_lock(backing_vnode, LK_SHARED | LK_RETRY, td); + vn_lock(backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto unlock_free_exit; if (auio.uio_resid != 0) { printf("ufs_extattr_enable: malformed attribute header\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { printf("ufs_extattr_enable: invalid attribute header magic\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { printf("ufs_extattr_enable: incorrect attribute header " "version\n"); error = EINVAL; goto unlock_free_exit; } ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable"); LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries); VOP_UNLOCK(backing_vnode, 0, td); return (0); unlock_free_exit: VOP_UNLOCK(backing_vnode, 0, td); free_exit: FREE(attribute, M_UFS_EXTATTR); return (error); } /* * Disable extended attribute support on an FS. */ static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td) { struct ufs_extattr_list_entry *uele; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); if (!uele) return (ENOATTR); LIST_REMOVE(uele, uele_entries); - vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY, - td); + vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable"); VOP_UNLOCK(uele->uele_backing_vnode, 0, td); error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, td->td_ucred, td); FREE(uele, M_UFS_EXTATTR); return (error); } /* * VFS call to manage extended attributes in UFS. If filename_vp is * non-NULL, it must be passed in locked, and regardless of errors in * processing, will be unlocked. */ int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td) { struct ufsmount *ump = VFSTOUFS(mp); int error; /* * Processes with privilege, but in jail, are not allowed to * configure extended attributes. */ error = priv_check(td, PRIV_UFS_EXTATTRCTL); if (error) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0, td); return (error); } switch(cmd) { case UFS_EXTATTR_CMD_START: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0, td); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_start(mp, td); return (error); case UFS_EXTATTR_CMD_STOP: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0, td); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_stop(mp, td); return (error); case UFS_EXTATTR_CMD_ENABLE: if (filename_vp == NULL) return (EINVAL); if (attrname == NULL) { VOP_UNLOCK(filename_vp, 0, td); return (EINVAL); } /* * ufs_extattr_enable_with_open() will always unlock the * vnode, regardless of failure. */ ufs_extattr_uepm_lock(ump, td); error = ufs_extattr_enable_with_open(ump, filename_vp, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump, td); return (error); case UFS_EXTATTR_CMD_DISABLE: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0, td); return (EINVAL); } if (attrname == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump, td); error = ufs_extattr_disable(ump, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump, td); return (error); default: return (EINVAL); } } /* * Vnode operating to retrieve a named extended attribute. */ int ufs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump, ap->a_td); error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump, ap->a_td); return (error); } /* * Real work associated with retrieving a named attribute--assumes that * the attribute lock has already been grabbed. */ static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; size_t len, old_len; int error = 0; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (strlen(name) == 0) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, IREAD); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Allow only offsets of zero to encourage the read/replace * extended attribute semantic. Otherwise we can't guarantee * atomicity, as we don't provide locks for extended attributes. */ if (uio != NULL && uio->uio_offset != 0) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Read in the data header to see if the data is defined, and if so * how much. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. * * Don't need to get a lock on the backing file if the getattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) - vn_lock(attribute->uele_backing_vnode, LK_SHARED | - LK_RETRY, td); + vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number * than the attribute data. For now, the best solution * is to coerce this to undefined, and let it get cleaned * up by the next write or extattrctl clean. */ printf("ufs_extattr_get (%s): inode number inconsistency (%d, %jd)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Local size consistency check. */ if (ueh.ueh_len > attribute->uele_fileheader.uef_size) { error = ENXIO; goto vopunlock_exit; } /* Return full data size if caller requested it. */ if (size != NULL) *size = ueh.ueh_len; /* Return data if the caller requested it. */ if (uio != NULL) { /* Allow for offset into the attribute data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); /* * Figure out maximum to transfer -- use buffer size and * local data limit. */ len = MIN(uio->uio_resid, ueh.ueh_len); old_len = uio->uio_resid; uio->uio_resid = len; error = VOP_READ(attribute->uele_backing_vnode, uio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; uio->uio_resid = old_len - (len - uio->uio_resid); } vopunlock_exit: if (uio != NULL) uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0, td); return (error); } /* * Vnode operation to remove a named attribute. */ int ufs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump, ap->a_td); error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump, ap->a_td); return (error); } /* * Vnode operation to set a named attribute. */ int ufs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump, ap->a_td); /* * XXX: No longer a supported way to delete extended attributes. */ if (ap->a_uio == NULL) return (EINVAL); error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump, ap->a_td); return (error); } /* * Real work associated with setting a vnode's extended attributes; * assumes that the attribute lock has already been grabbed. */ static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, IWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Early rejection of invalid offsets/length. * Reject: any offset but 0 (replace) * Any size greater than attribute size limit */ if (uio->uio_offset != 0 || uio->uio_resid > attribute->uele_fileheader.uef_size) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Write out a data header for the data. */ ueh.ueh_len = uio->uio_resid; ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE; ueh.ueh_i_gen = ip->i_gen; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. * * Don't need to get a lock on the backing file if the setattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) - vn_lock(attribute->uele_backing_vnode, - LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) { error = ENXIO; goto vopunlock_exit; } /* * Write out user data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, ump->um_extattr.uepm_ucred); vopunlock_exit: uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0, td); return (error); } /* * Real work associated with removing an extended attribute from a vnode. * Assumes the attribute lock has already been grabbed. */ static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, IWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Check to see if currently defined. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Don't need to get the lock on the backing vnode if the vnode we're * modifying is it, as we already hold the lock. */ if (attribute->uele_backing_vnode != vp) - vn_lock(attribute->uele_backing_vnode, - LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number than * the attribute data. For now, the best solution is to * coerce this to undefined, and let it get cleaned up by * the next write or extattrctl clean. */ printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Flag it as not in use. */ ueh.ueh_flags = 0; ueh.ueh_len = 0; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) error = ENXIO; vopunlock_exit: VOP_UNLOCK(attribute->uele_backing_vnode, 0, td); return (error); } /* * Called by UFS when an inode is no longer active and should have its * attributes stripped. */ void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); /* * In that case, we cannot lock. We should not have any active vnodes * on the fs if this is not yet initialized but is going to be, so * this can go unlocked. */ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return; ufs_extattr_uepm_lock(ump, td); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { ufs_extattr_uepm_unlock(ump, td); return; } LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) ufs_extattr_rm(vp, uele->uele_attrnamespace, uele->uele_attrname, NULL, td); ufs_extattr_uepm_unlock(ump, td); } #endif /* !UFS_EXTATTR */ Index: head/sys/ufs/ufs/ufs_lookup.c =================================================================== --- head/sys/ufs/ufs/ufs_lookup.c (revision 175201) +++ head/sys/ufs/ufs/ufs_lookup.c (revision 175202) @@ -1,1253 +1,1253 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ffs_broken_fixme.h" #include "opt_ufs.h" #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #include #include #ifdef DIAGNOSTIC static int dirchk = 1; #else static int dirchk = 0; #endif SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the filesystem is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * This routine is actually used as VOP_CACHEDLOOKUP method, and the * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP * method. * * vfs_cache_lookup() performs the following for us: * check that it is a directory * check accessibility of directory * check for modification attempts on read-only mounts * if name found in cache * if at end of path and deleting or creating * drop it * else * return name. * return VOP_CACHEDLOOKUP() * * Overall outline of ufs_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ufs_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *vdp; /* vnode for directory being searched */ struct inode *dp; /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct direct *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ enum {NONE, COMPACT, FOUND} slotstatus; doff_t slotoffset; /* offset of area with free space */ int slotsize; /* size of area at slotoffset */ int slotfreespace; /* amount of space free in slot */ int slotneeded; /* size of the entry we're seeking */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* prev entry dp->i_offset */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ int namlen, error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; struct thread *td = cnp->cn_thread; ino_t saved_ino; bp = NULL; slotoffset = -1; /* * XXX there was a soft-update diff about this I couldn't merge. * I think this was the equiv. */ *vpp = NULL; vdp = ap->a_dvp; dp = VTOI(vdp); /* * We now have a segment name to search for, and a directory to search. * * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ slotstatus = FOUND; slotfreespace = slotsize = slotneeded = 0; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { slotstatus = NONE; slotneeded = DIRECTSIZ(cnp->cn_namelen); } bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; #ifdef UFS_DIRHASH /* * Use dirhash for fast operations on large directories. The logic * to determine whether to hash the directory is contained within * ufsdirhash_build(); a zero return means that it decided to hash * this directory and it successfully built up the hash table. */ if (ufsdirhash_build(dp) == 0) { /* Look for a free slot if needed. */ enduseful = dp->i_size; if (slotstatus != FOUND) { slotoffset = ufsdirhash_findfree(dp, slotneeded, &slotsize); if (slotoffset >= 0) { slotstatus = COMPACT; enduseful = ufsdirhash_enduseful(dp); if (enduseful < 0) enduseful = dp->i_size; } } /* Look up the component. */ numdirpasses = 1; entryoffsetinblock = 0; /* silence compiler warning */ switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, &dp->i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { case 0: ep = (struct direct *)((char *)bp->b_data + (dp->i_offset & bmask)); goto foundentry; case ENOENT: dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); goto notfound; default: /* Something failed; just do a linear search. */ break; } } #endif /* UFS_DIRHASH */ /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ if (nameiop != LOOKUP || dp->i_diroff == 0 || dp->i_diroff >= dp->i_size) { entryoffsetinblock = 0; dp->i_offset = 0; numdirpasses = 1; } else { dp->i_offset = dp->i_diroff; if ((entryoffsetinblock = dp->i_offset & bmask) && (error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } prevoff = dp->i_offset; endsearch = roundup2(dp->i_size, DIRBLKSIZ); enduseful = 0; searchloop: while (dp->i_offset < endsearch) { /* * If necessary, get the next directory block. */ if ((dp->i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp); if (error) return (error); entryoffsetinblock = 0; } /* * If still looking for a slot, and at a DIRBLKSIZE * boundary, have to start looking for free space again. */ if (slotstatus == NONE && (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { slotoffset = -1; slotfreespace = 0; } /* * Get pointer to next entry. * Full validation checks are slow, so we only check * enough to insure forward progress through the * directory. Complete checks can be run by patching * "dirchk" to be true. */ ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { int i; ufs_dirbad(dp, dp->i_offset, "mangled entry"); i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); dp->i_offset += i; entryoffsetinblock += i; continue; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. */ if (slotstatus != FOUND) { int size = ep->d_reclen; if (ep->d_ino != 0) size -= DIRSIZ(OFSFMT(vdp), ep); if (size > 0) { if (size >= slotneeded) { slotstatus = FOUND; slotoffset = dp->i_offset; slotsize = ep->d_reclen; } else if (slotstatus == NONE) { slotfreespace += size; if (slotoffset == -1) slotoffset = dp->i_offset; if (slotfreespace >= slotneeded) { slotstatus = COMPACT; slotsize = dp->i_offset + ep->d_reclen - slotoffset; } } } } /* * Check for a name match. */ if (ep->d_ino) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vdp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (namlen == cnp->cn_namelen && (cnp->cn_nameptr[0] == ep->d_name[0]) && !bcmp(cnp->cn_nameptr, ep->d_name, (unsigned)namlen)) { #ifdef UFS_DIRHASH foundentry: #endif /* * Save directory entry's inode number and * reclen in ndp->ni_ufs area, and release * directory buffer. */ if (vdp->v_mount->mnt_maxsymlinklen > 0 && ep->d_type == DT_WHT) { slotstatus = FOUND; slotoffset = dp->i_offset; slotsize = ep->d_reclen; dp->i_reclen = slotsize; enduseful = dp->i_size; ap->a_cnp->cn_flags |= ISWHITEOUT; numdirpasses--; goto notfound; } dp->i_ino = ep->d_ino; dp->i_reclen = ep->d_reclen; goto found; } } prevoff = dp->i_offset; dp->i_offset += ep->d_reclen; entryoffsetinblock += ep->d_reclen; if (ep->d_ino) enduseful = dp->i_offset; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; dp->i_offset = 0; endsearch = dp->i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME || (nameiop == DELETE && (ap->a_cnp->cn_flags & DOWHITEOUT) && (ap->a_cnp->cn_flags & ISWHITEOUT))) && (flags & ISLASTCN) && dp->i_effnlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. */ error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, * then set dp->i_count to 0 indicating * that the new slot belongs at the end of the * directory. If we found a slot, then the new entry * can be put in the range from dp->i_offset to * dp->i_offset + dp->i_count. */ if (slotstatus == NONE) { dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); dp->i_count = 0; enduseful = dp->i_offset; } else if (nameiop == DELETE) { dp->i_offset = slotoffset; if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; } else { dp->i_offset = slotoffset; dp->i_count = slotsize; if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * The pathname buffer is saved so that the name * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } /* * Insert name into cache (as non-existent) if appropriate. */ if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(vdp, *vpp, cnp); return (ENOENT); found: if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Check that directory length properly reflects presence * of this entry. */ if (dp->i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { ufs_dirbad(dp, dp->i_offset, "i_size too small"); dp->i_size = dp->i_offset + DIRSIZ(OFSFMT(vdp), ep); DIP_SET(dp, i_size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; } brelse(bp); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { /* * Write access to directory required to delete files. */ error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) return (error); /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there * is a previous entry in this block) in dp->i_count. * Save directory inode pointer in ndp->ni_dvp for dirremove(). */ if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; if (dp->i_number == dp->i_ino) { VREF(vdp); *vpp = vdp; return (0); } if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ if ((dp->i_mode & ISVTX) && VOP_ACCESS(vdp, VADMIN, cred, cnp->cn_thread) && VOP_ACCESS(tdp, VADMIN, cred, cnp->cn_thread)) { vput(tdp); return (EPERM); } *vpp = tdp; return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread))) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ if (dp->i_number == dp->i_ino) return (EISDIR); if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); *vpp = tdp; cnp->cn_flags |= SAVENAME; return (0); } /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { saved_ino = dp->i_ino; VOP_UNLOCK(pdp, 0, td); /* race to get the inode */ error = VFS_VGET(pdp->v_mount, saved_ino, cnp->cn_lkflags, &tdp); - vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); *vpp = tdp; } else if (dp->i_number == dp->i_ino) { VREF(vdp); /* we want ourself, ie "." */ *vpp = vdp; } else { error = VFS_VGET(pdp->v_mount, dp->i_ino, cnp->cn_lkflags, &tdp); if (error) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } void ufs_dirbad(ip, offset, how) struct inode *ip; doff_t offset; char *how; { struct mount *mp; mp = ITOV(ip)->v_mount; if ((mp->mnt_flag & MNT_RDONLY) == 0) panic("ufs_dirbad: %s: bad dir ino %lu at offset %ld: %s", mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how); else (void)printf("%s: bad dir ino %lu at offset %ld: %s\n", mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than MAXNAMLEN * name must be as long as advertised, and null terminated */ int ufs_dirbadentry(dp, ep, entryoffsetinblock) struct vnode *dp; struct direct *ep; int entryoffsetinblock; { int i, namlen; # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > MAXNAMLEN) { /*return (1); */ printf("First bad\n"); goto bad; } if (ep->d_ino == 0) return (0); for (i = 0; i < namlen; i++) if (ep->d_name[i] == '\0') { /*return (1); */ printf("Second bad\n"); goto bad; } if (ep->d_name[i]) goto bad; return (0); bad: return (1); } /* * Construct a new directory entry after a call to namei, using the * parameters that it left in the componentname argument cnp. The * argument ip is the inode to which the new directory entry will refer. */ void ufs_makedirentry(ip, cnp, newdirp) struct inode *ip; struct componentname *cnp; struct direct *newdirp; { #ifdef INVARIANTS if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_makedirentry: missing name"); #endif newdirp->d_ino = ip->i_number; newdirp->d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1); if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) newdirp->d_type = IFTODT(ip->i_mode); else { newdirp->d_type = 0; # if (BYTE_ORDER == LITTLE_ENDIAN) { u_char tmp = newdirp->d_namlen; newdirp->d_namlen = newdirp->d_type; newdirp->d_type = tmp; } # endif } } /* * Write a directory entry after a call to namei, using the parameters * that it left in nameidata. The argument dirp is the new directory * entry contents. Dvp is a pointer to the directory to be written, * which was left locked by namei. Remaining parameters (dp->i_offset, * dp->i_count) indicate how the space for the new entry is to be obtained. * Non-null bp indicates that a directory is being created (for the * soft dependency code). */ int ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; { struct ucred *cr; struct thread *td; int newentrysize; struct inode *dp; struct buf *bp; u_int dsize; struct direct *ep, *nep; int error, ret, blkoff, loc, spacefree, flags, namlen; char *dirbuf; td = curthread; /* XXX */ cr = td->td_ucred; dp = VTOI(dvp); newentrysize = DIRSIZ(OFSFMT(dvp), dirp); if (dp->i_count == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (dp->i_offset & (DIRBLKSIZ - 1)) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) flags |= IO_SYNC; #ifdef QUOTA if ((error = getinoquota(dp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } #endif if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } dp->i_size = dp->i_offset + DIRBLKSIZ; DIP_SET(dp, i_size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; vnode_pager_setsize(dvp, (u_long)dp->i_size); dirp->d_reclen = DIRBLKSIZ; blkoff = dp->i_offset & (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { ufsdirhash_newblk(dp, dp->i_offset); ufsdirhash_add(dp, dirp, dp->i_offset); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, dp->i_offset); } #endif if (DOINGSOFTDEP(dvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff += DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, dp->i_offset, dirp->d_ino, newdirbp, 1) == 0) { bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } /* We have just allocated a directory block in an * indirect block. Rather than tracking when it gets * claimed by the inode, we simply do a VOP_FSYNC * now to ensure that it is there (in case the user * does a future fsync). Note that we have to unlock * the inode for the entry that we just entered, as * the VOP_FSYNC may need to lock other inodes which * can lead to deadlock if we also hold a lock on * the newly entered node. */ if ((error = bwrite(bp))) return (error); if (tvp != NULL) VOP_UNLOCK(tvp, 0, td); error = VOP_FSYNC(dvp, MNT_WAIT, td); if (tvp != NULL) - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); return (error); } if (DOINGASYNC(dvp)) { bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } error = bwrite(bp); ret = UFS_UPDATE(dvp, 1); if (error == 0) return (ret); return (error); } /* * If dp->i_count is non-zero, then namei found space for the new * entry in the range dp->i_offset to dp->i_offset + dp->i_count * in the directory. To use this space, we may have to compact * the entries located there, by copying them together towards the * beginning of the block, leaving the free space in one usable * chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZE. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ if (dp->i_offset + dp->i_count > dp->i_size) { dp->i_size = dp->i_offset + dp->i_count; DIP_SET(dp, i_size, dp->i_size); } /* * Get the block containing the space for the new directory entry. */ error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp); if (error) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region dp->i_offset to * dp->i_offset + dp->i_count would yield the space. */ ep = (struct direct *)dirbuf; dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; spacefree = ep->d_reclen - dsize; for (loc = ep->d_reclen; loc < dp->i_count; ) { nep = (struct direct *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). */ ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); /* Read nep->d_reclen now as the bcopy() may clobber it. */ loc += nep->d_reclen; if (nep->d_ino == 0) { /* * A mid-block unused entry. Such entries are * never created by the kernel, but fsck_ffs * can create them (and it doesn't fix them). * * Add up the free space, and initialise the * relocated entry since we don't bcopy it. */ spacefree += nep->d_reclen; ep->d_ino = 0; dsize = 0; continue; } dsize = DIRSIZ(OFSFMT(dvp), nep); spacefree += nep->d_reclen - dsize; #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, dp->i_offset + ((char *)nep - dirbuf), dp->i_offset + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) softdep_change_directoryentry_offset(dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); } /* * Here, `ep' points to a directory entry containing `dsize' in-use * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, * then the entry is completely unused (dsize == 0). The value * of ep->d_reclen is always indeterminate. * * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dvp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (ep->d_ino == 0 || (ep->d_ino == WINO && namlen == dirp->d_namlen && bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { if (spacefree + dsize < newentrysize) panic("ufs_direnter: compact1"); dirp->d_reclen = spacefree + dsize; } else { if (spacefree < newentrysize) panic("ufs_direnter: compact2"); dirp->d_reclen = spacefree; ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf)); #endif bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - (dp->i_offset & (DIRBLKSIZ - 1)), dp->i_offset & ~(DIRBLKSIZ - 1)); #endif if (DOINGSOFTDEP(dvp)) { (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If all went well, and the directory can be shortened, proceed * with the truncation. Note that we have to unlock the inode for * the entry that we just entered, as the truncation may need to * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. */ if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp, 0, td); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_dirtrunc(dp, dp->i_endoff); #endif (void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_NORMAL | IO_SYNC, cr, td); if (tvp != NULL) - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); } return (error); } /* * Remove a directory entry after a call to namei, using * the parameters which it left in nameidata. The entry * dp->i_offset contains the offset into the directory of the * entry to be eliminated. The dp->i_count field contains the * size of the previous record in the directory. If this * is 0, the first entry is being deleted, so we need only * zero the inode number to mark the entry as free. If the * entry is not the first in the directory, we must reclaim * the space of the now empty record by adding the record size * to the size of the previous entry. */ int ufs_dirremove(dvp, ip, flags, isrmdir) struct vnode *dvp; struct inode *ip; int flags; int isrmdir; { struct inode *dp; struct direct *ep; struct buf *bp; int error; dp = VTOI(dvp); if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to WINO. */ if ((error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) return (error); ep->d_ino = WINO; ep->d_type = DT_WHT; goto out; } if ((error = UFS_BLKATOFF(dvp, (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) return (error); #ifdef UFS_DIRHASH /* * Remove the dirhash entry. This is complicated by the fact * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, (dp->i_count == 0) ? ep : (struct direct *)((char *)ep + ep->d_reclen), dp->i_offset); #endif if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. */ ep->d_ino = 0; } else { /* * Collapse new free space into previous entry. */ ep->d_reclen += dp->i_reclen; } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), dp->i_offset & ~(DIRBLKSIZ - 1)); #endif out: if (DOINGSOFTDEP(dvp)) { if (ip) { ip->i_effnlink--; softdep_change_linkcnt(ip); softdep_setup_remove(bp, dp, ip, isrmdir); } if (softdep_slowdown(dvp)) { error = bwrite(bp); } else { bdwrite(bp); error = 0; } } else { if (ip) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; } if (flags & DOWHITEOUT) error = bwrite(bp); else if (DOINGASYNC(dvp) && dp->i_count != 0) { bdwrite(bp); error = 0; } else error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ #if defined(FFS) || defined(IFS) if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_effnlink == 0) ffs_snapgone(ip); #endif return (error); } /* * Rewrite an existing directory entry to point at the inode * supplied. The parameters describing the directory entry are * set up by a call to namei. */ int ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) struct inode *dp, *oip; ino_t newinum; int newtype; int isrmdir; { struct buf *bp; struct direct *ep; struct vnode *vdp = ITOV(dp); int error; error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); if (error) return (error); ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; oip->i_effnlink--; if (DOINGSOFTDEP(vdp)) { softdep_change_linkcnt(oip); softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { oip->i_nlink--; DIP_SET(oip, i_nlink, oip->i_nlink); oip->i_flag |= IN_CHANGE; if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ #if defined(FFS) || defined(IFS) if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_effnlink == 0) ffs_snapgone(oip); #endif return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories. */ int ufs_dirempty(ip, parentino, cred) struct inode *ip; ino_t parentino; struct ucred *cred; { doff_t off; struct dirtemplate dbuf; struct direct *dp = (struct direct *)&dbuf; int error, count, namlen; #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += dp->d_reclen) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, &count, (struct thread *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->d_reclen == 0) return (0); /* skip empty entries */ if (dp->d_ino == 0 || dp->d_ino == WINO) continue; /* accept only "." and ".." */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(ITOV(ip))) namlen = dp->d_type; else namlen = dp->d_namlen; # else namlen = dp->d_namlen; # endif if (namlen > 2) return (0); if (dp->d_name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1 && dp->d_ino == ip->i_number) continue; if (dp->d_name[1] == '.' && dp->d_ino == parentino) continue; return (0); } return (1); } /* * Check if source directory is in the path of the target directory. * Target is supplied locked, source is unlocked. * The target is always vput before returning. */ int ufs_checkpath(source, target, cred) struct inode *source, *target; struct ucred *cred; { struct vnode *vp; int error, namlen; ino_t rootino; struct dirtemplate dirbuf; vp = ITOV(target); if (target->i_number == source->i_number) { error = EEXIST; goto out; } rootino = ROOTINO; error = 0; if (target->i_number == rootino) goto out; for (;;) { if (vp->v_type != VDIR) { error = ENOTDIR; break; } error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, (int *)0, (struct thread *)0); if (error != 0) break; # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vp)) namlen = dirbuf.dotdot_type; else namlen = dirbuf.dotdot_namlen; # else namlen = dirbuf.dotdot_namlen; # endif if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') { error = ENOTDIR; break; } if (dirbuf.dotdot_ino == source->i_number) { error = EINVAL; break; } if (dirbuf.dotdot_ino == rootino) break; vput(vp); error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino, LK_EXCLUSIVE, &vp); if (error) { vp = NULL; break; } } out: if (error == ENOTDIR) printf("checkpath: .. not a directory\n"); if (vp != NULL) vput(vp); return (error); } Index: head/sys/ufs/ufs/ufs_quota.c =================================================================== --- head/sys/ufs/ufs/ufs_quota.c (revision 175201) +++ head/sys/ufs/ufs/ufs_quota.c (revision 175202) @@ -1,1449 +1,1447 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int unprivileged_get_quota = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW, &unprivileged_get_quota, 0, "Unprivileged processes may retrieve quotas for other uids and gids"); static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries"); /* * Quota name to error message mapping. */ static char *quotatypes[] = INITQFNAMES; static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *); static int chkiqchg(struct inode *, int, struct ucred *, int, int *); static int dqget(struct vnode *, u_long, struct ufsmount *, int, struct dquot **); static int dqsync(struct vnode *, struct dquot *); static void dqflush(struct vnode *); static int quotaoff1(struct thread *td, struct mount *mp, int type); static int quotaoff_inchange(struct thread *td, struct mount *mp, int type); #ifdef DIAGNOSTIC static void dqref(struct dquot *); static void chkdquot(struct inode *); #endif /* * Set up the quotas for an inode. * * This routine completely defines the semantics of quotas. * If other criterion want to be used to establish quotas, the * MAXQUOTAS value in quotas.h should be increased, and the * additional dquots set up here. */ int getinoquota(ip) struct inode *ip; { struct ufsmount *ump; struct vnode *vp; int error; vp = ITOV(ip); /* * Disk quotas must be turned off for system files. Currently * snapshot and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return (0); /* * XXX: Turn off quotas for files with a negative UID or GID. * This prevents the creation of 100GB+ quota files. */ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) return (0); ump = VFSTOUFS(vp->v_mount); /* * Set up the user quota based on file uid. * EINVAL means that quotas are not enabled. */ if ((error = dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && error != EINVAL) return (error); /* * Set up the group quota based on file gid. * EINVAL means that quotas are not enabled. */ if ((error = dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && error != EINVAL) return (error); return (0); } /* * Update disk usage, and take corrective action. */ int chkdq(ip, change, cred, flags) struct inode *ip; ufs2_daddr_t change; struct ucred *cred; int flags; { struct dquot *dq; ufs2_daddr_t ncurblocks; struct vnode *vp = ITOV(ip); int i, error, warn, do_check; /* * Disk quotas must be turned off for system files. Currently * snapshot and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return (0); /* * XXX: Turn off quotas for files with a negative UID or GID. * This prevents the creation of 100GB+ quota files. */ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) return (0); #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkdq1"); ncurblocks = dq->dq_curblocks + change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (0); } if ((flags & FORCE) == 0 && priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0)) do_check = 1; else do_check = 0; for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; warn = 0; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkdq2"); if (do_check) { error = chkdqchg(ip, change, cred, i, &warn); if (error) { /* * Roll back user quota changes when * group quota failed. */ while (i > 0) { --i; dq = ip->i_dquot[i]; if (dq == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkdq3"); ncurblocks = dq->dq_curblocks - change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (error); } } /* Reset timer when crossing soft limit */ if (dq->dq_curblocks + change >= dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_btime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i]; dq->dq_curblocks += change; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); if (warn) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[i], "disk quota exceeded"); } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkdqchg(ip, change, cred, type, warn) struct inode *ip; ufs2_daddr_t change; struct ucred *cred; int type; int *warn; { struct dquot *dq = ip->i_dquot[type]; ufs2_daddr_t ncurblocks = dq->dq_curblocks + change; /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_BLKS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s disk limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } /* * If user is over their soft limit for too long, disallow space * allocation. Reset time limit as they cross their soft limit. */ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { if (dq->dq_curblocks < dq->dq_bsoftlimit) { dq->dq_btime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type]; if (ip->i_uid == cred->cr_uid) *warn = 1; return (0); } if (time_second > dq->dq_btime) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_BLKS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded for too long"); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } } return (0); } /* * Check the inode limit, applying corrective action. */ int chkiq(ip, change, cred, flags) struct inode *ip; int change; struct ucred *cred; int flags; { struct dquot *dq; ino_t ncurinodes; int i, error, warn, do_check; #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkiq1"); ncurinodes = dq->dq_curinodes + change; /* XXX: ncurinodes is unsigned */ if (dq->dq_curinodes != 0 && ncurinodes >= 0) dq->dq_curinodes = ncurinodes; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (0); } if ((flags & FORCE) == 0 && priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0)) do_check = 1; else do_check = 0; for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; warn = 0; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkiq2"); if (do_check) { error = chkiqchg(ip, change, cred, i, &warn); if (error) { /* * Roll back user quota changes when * group quota failed. */ while (i > 0) { --i; dq = ip->i_dquot[i]; if (dq == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkiq3"); ncurinodes = dq->dq_curinodes - change; /* XXX: ncurinodes is unsigned */ if (dq->dq_curinodes != 0 && ncurinodes >= 0) dq->dq_curinodes = ncurinodes; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (error); } } /* Reset timer when crossing soft limit */ if (dq->dq_curinodes + change >= dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_itime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i]; dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); if (warn) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[i], "inode quota exceeded"); } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkiqchg(ip, change, cred, type, warn) struct inode *ip; int change; struct ucred *cred; int type; int *warn; { struct dquot *dq = ip->i_dquot[type]; ino_t ncurinodes = dq->dq_curinodes + change; /* * If user would exceed their hard limit, disallow inode allocation. */ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_INODS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s inode limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } /* * If user is over their soft limit for too long, disallow inode * allocation. Reset time limit as they cross their soft limit. */ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { if (dq->dq_curinodes < dq->dq_isoftlimit) { dq->dq_itime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type]; if (ip->i_uid == cred->cr_uid) *warn = 1; return (0); } if (time_second > dq->dq_itime) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_INODS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded for too long"); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } } return (0); } #ifdef DIAGNOSTIC /* * On filesystems with quotas enabled, it is an error for a file to change * size and not to have a dquot structure associated with it. */ static void chkdquot(ip) struct inode *ip; { struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount); struct vnode *vp = ITOV(ip); int i; /* * Disk quotas must be turned off for system files. Currently * these are snapshots and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return; /* * XXX: Turn off quotas for files with a negative UID or GID. * This prevents the creation of 100GB+ quota files. */ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) return; UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP || (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) continue; if (ip->i_dquot[i] == NODQUOT) { UFS_UNLOCK(ump); vprint("chkdquot: missing dquot", ITOV(ip)); panic("chkdquot: missing dquot"); } } UFS_UNLOCK(ump); } #endif /* * Code to process quotactl commands. */ /* * Q_QUOTAON - set up a quota file for a particular filesystem. */ int quotaon(td, mp, type, fname) struct thread *td; struct mount *mp; int type; void *fname; { struct ufsmount *ump; struct vnode *vp, **vpp; struct vnode *mvp; struct dquot *dq; int error, flags, vfslocked; struct nameidata nd; error = priv_check(td, PRIV_UFS_QUOTAON); if (error) return (error); ump = VFSTOUFS(mp); dq = NODQUOT; NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, fname, td); flags = FREAD | FWRITE; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (EACCES); } UFS_LOCK(ump); if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { UFS_UNLOCK(ump); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (EALREADY); } ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING; MNT_ILOCK(mp); mp->mnt_flag |= MNT_QUOTA; MNT_IUNLOCK(mp); UFS_UNLOCK(ump); vpp = &ump->um_quotas[type]; if (*vpp != vp) quotaoff1(td, mp, type); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_SYSTEM; VOP_UNLOCK(vp, 0, td); *vpp = vp; VFS_UNLOCK_GIANT(vfslocked); /* * Save the credential of the process that turned on quotas. * Set up the time limits for this quota. */ ump->um_cred[type] = crhold(td->td_ucred); ump->um_btime[type] = MAX_DQ_TIME; ump->um_itime[type] = MAX_IQ_TIME; if (dqget(NULLVP, 0, ump, type, &dq) == 0) { if (dq->dq_btime > 0) ump->um_btime[type] = dq->dq_btime; if (dq->dq_itime > 0) ump->um_itime[type] = dq->dq_itime; dqrele(NULLVP, dq); } /* * Allow the getdq from getinoquota below to read the quota * from file. */ UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_CLOSING; UFS_UNLOCK(ump); /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. */ MNT_ILOCK(mp); again: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } if (vp->v_type == VNON || vp->v_writecount == 0) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); continue; } error = getinoquota(VTOI(vp)); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); if (error) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); break; } } MNT_IUNLOCK(mp); if (error) quotaoff_inchange(td, mp, type); UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_OPENING; KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0, ("quotaon: leaking flags")); UFS_UNLOCK(ump); return (error); } /* * Main code to turn off disk quotas for a filesystem. Does not change * flags. */ static int quotaoff1(td, mp, type) struct thread *td; struct mount *mp; int type; { struct vnode *vp; struct vnode *qvp, *mvp; struct ufsmount *ump; struct dquot *dq; struct inode *ip; struct ucred *cr; int vfslocked; int error; ump = VFSTOUFS(mp); UFS_LOCK(ump); KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0, ("quotaoff1: flags are invalid")); if ((qvp = ump->um_quotas[type]) == NULLVP) { UFS_UNLOCK(ump); return (0); } cr = ump->um_cred[type]; UFS_UNLOCK(ump); /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed. */ MNT_ILOCK(mp); again: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vp->v_type == VNON) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } ip = VTOI(vp); dq = ip->i_dquot[type]; ip->i_dquot[type] = NODQUOT; dqrele(vp, dq); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); dqflush(qvp); /* Clear um_quotas before closing the quota vnode to prevent * access to the closed vnode from dqget/dqsync */ UFS_LOCK(ump); ump->um_quotas[type] = NULLVP; ump->um_cred[type] = NOCRED; UFS_UNLOCK(ump); vfslocked = VFS_LOCK_GIANT(qvp->v_mount); - vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY); qvp->v_vflag &= ~VV_SYSTEM; VOP_UNLOCK(qvp, 0, td); error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); crfree(cr); return (error); } /* * Turns off quotas, assumes that ump->um_qflags are already checked * and QTF_CLOSING is set to indicate operation in progress. Fixes * ump->um_qflags and mp->mnt_flag after. */ int quotaoff_inchange(td, mp, type) struct thread *td; struct mount *mp; int type; { struct ufsmount *ump; int i; int error; error = quotaoff1(td, mp, type); ump = VFSTOUFS(mp); UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_CLOSING; for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_QUOTA; MNT_IUNLOCK(mp); } UFS_UNLOCK(ump); return (error); } /* * Q_QUOTAOFF - turn off disk quotas for a filesystem. */ int quotaoff(td, mp, type) struct thread *td; struct mount *mp; int type; { struct ufsmount *ump; int error; error = priv_check(td, PRIV_UFS_QUOTAOFF); if (error) return (error); ump = VFSTOUFS(mp); UFS_LOCK(ump); if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { UFS_UNLOCK(ump); return (EALREADY); } ump->um_qflags[type] |= QTF_CLOSING; UFS_UNLOCK(ump); return (quotaoff_inchange(td, mp, type)); } /* * Q_GETQUOTA - return current values in a dqblk structure. */ int getquota(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; int error; switch (type) { case USRQUOTA: if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) { error = priv_check(td, PRIV_VFS_GETQUOTA); if (error) return (error); } break; case GRPQUOTA: if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) { error = priv_check(td, PRIV_VFS_GETQUOTA); if (error) return (error); } break; default: return (EINVAL); } dq = NODQUOT; error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq); if (error) return (error); error = copyout(&dq->dq_dqb, addr, sizeof (struct dqblk)); dqrele(NULLVP, dq); return (error); } /* * Q_SETQUOTA - assign an entire dqblk structure. */ int setquota(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; struct dquot *ndq; struct ufsmount *ump; struct dqblk newlim; int error; error = priv_check(td, PRIV_VFS_SETQUOTA); if (error) return (error); ump = VFSTOUFS(mp); error = copyin(addr, &newlim, sizeof (struct dqblk)); if (error) return (error); ndq = NODQUOT; ump = VFSTOUFS(mp); error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "setqta"); /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. */ newlim.dqb_curblocks = dq->dq_curblocks; newlim.dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { newlim.dqb_btime = dq->dq_btime; newlim.dqb_itime = dq->dq_itime; } if (newlim.dqb_bsoftlimit && dq->dq_curblocks >= newlim.dqb_bsoftlimit && (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) newlim.dqb_btime = time_second + ump->um_btime[type]; if (newlim.dqb_isoftlimit && dq->dq_curinodes >= newlim.dqb_isoftlimit && (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) newlim.dqb_itime = time_second + ump->um_itime[type]; dq->dq_dqb = newlim; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); dqrele(NULLVP, dq); return (0); } /* * Q_SETUSE - set current inode and block usage. */ int setuse(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; struct ufsmount *ump; struct dquot *ndq; struct dqblk usage; int error; error = priv_check(td, PRIV_UFS_SETUSE); if (error) return (error); ump = VFSTOUFS(mp); error = copyin(addr, &usage, sizeof (struct dqblk)); if (error) return (error); ump = VFSTOUFS(mp); ndq = NODQUOT; error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "setuse"); /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. */ if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && usage.dqb_curblocks >= dq->dq_bsoftlimit) dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && usage.dqb_curinodes >= dq->dq_isoftlimit) dq->dq_itime = time_second + ump->um_itime[type]; dq->dq_curblocks = usage.dqb_curblocks; dq->dq_curinodes = usage.dqb_curinodes; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); dqrele(NULLVP, dq); return (0); } /* * Q_SYNC - sync quota files to disk. */ int qsync(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct thread *td = curthread; /* XXX */ struct vnode *vp, *mvp; struct dquot *dq; int i, error; /* * Check if the mount point has any quotas. * If not, simply return. */ UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; UFS_UNLOCK(ump); if (i == MAXQUOTAS) return (0); /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. */ MNT_ILOCK(mp); again: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vp->v_type == VNON) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); if (error) { MNT_ILOCK(mp); if (error == ENOENT) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } continue; } for (i = 0; i < MAXQUOTAS; i++) { dq = VTOI(vp)->i_dquot[i]; if (dq != NODQUOT) dqsync(vp, dq); } vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Code pertaining to management of the in-core dquot data structures. */ #define DQHASH(dqvp, id) \ (&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + id) & dqhash]) static LIST_HEAD(dqhash, dquot) *dqhashtbl; static u_long dqhash; /* * Dquot free list. */ #define DQUOTINC 5 /* minimum free dquots desired */ static TAILQ_HEAD(dqfreelist, dquot) dqfreelist; static long numdquot, desireddquot = DQUOTINC; /* * Lock to protect quota hash, dq free list and dq_cnt ref counters of * _all_ dqs. */ struct mtx dqhlock; #define DQH_LOCK() mtx_lock(&dqhlock) #define DQH_UNLOCK() mtx_unlock(&dqhlock) static struct dquot *dqhashfind(struct dqhash *dqh, u_long id, struct vnode *dqvp); /* * Initialize the quota system. */ void dqinit() { mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF); dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); TAILQ_INIT(&dqfreelist); } /* * Shut down the quota system. */ void dquninit() { struct dquot *dq; hashdestroy(dqhashtbl, M_DQUOT, dqhash); while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) { TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); mtx_destroy(&dq->dq_lock); free(dq, M_DQUOT); } mtx_destroy(&dqhlock); } static struct dquot * dqhashfind(dqh, id, dqvp) struct dqhash *dqh; u_long id; struct vnode *dqvp; { struct dquot *dq; mtx_assert(&dqhlock, MA_OWNED); LIST_FOREACH(dq, dqh, dq_hash) { if (dq->dq_id != id || dq->dq_ump->um_quotas[dq->dq_type] != dqvp) continue; /* * Cache hit with no references. Take * the structure off the free list. */ if (dq->dq_cnt == 0) TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); DQREF(dq); return (dq); } return (NODQUOT); } /* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. */ static int dqget(vp, id, ump, type, dqp) struct vnode *vp; u_long id; struct ufsmount *ump; int type; struct dquot **dqp; { - struct thread *td = curthread; /* XXX */ struct dquot *dq, *dq1; struct dqhash *dqh; struct vnode *dqvp; struct iovec aiov; struct uio auio; int vfslocked, dqvplocked, error; #ifdef DEBUG_VFS_LOCKS if (vp != NULLVP) ASSERT_VOP_ELOCKED(vp, "dqget"); #endif if (vp != NULLVP && *dqp != NODQUOT) { return (0); } /* XXX: Disallow negative id values to prevent the * creation of 100GB+ quota data files. */ if ((int)id < 0) return (EINVAL); UFS_LOCK(ump); dqvp = ump->um_quotas[type]; if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { *dqp = NODQUOT; UFS_UNLOCK(ump); return (EINVAL); } vref(dqvp); UFS_UNLOCK(ump); error = 0; dqvplocked = 0; /* * Check the cache first. */ dqh = DQHASH(dqvp, id); DQH_LOCK(); dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { DQH_UNLOCK(); hfound: DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "dqget"); DQI_UNLOCK(dq); if (dq->dq_ump == NULL) { dqrele(vp, dq); dq = NODQUOT; error = EIO; } *dqp = dq; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); if (dqvplocked) vput(dqvp); else vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there * since new dq will appear on the hash chain DQ_LOCKed. */ if (vp != dqvp) { DQH_UNLOCK(); - vn_lock(dqvp, LK_SHARED | LK_RETRY, td); + vn_lock(dqvp, LK_SHARED | LK_RETRY); dqvplocked = 1; DQH_LOCK(); /* * Recheck the cache after sleep for quota vnode lock. */ dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { DQH_UNLOCK(); goto hfound; } } /* * Not in cache, allocate a new one or take it from the * free list. */ if (TAILQ_FIRST(&dqfreelist) == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) desireddquot += DQUOTINC; if (numdquot < desireddquot) { numdquot++; DQH_UNLOCK(); dq1 = (struct dquot *)malloc(sizeof *dq, M_DQUOT, M_WAITOK | M_ZERO); mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF); DQH_LOCK(); /* * Recheck the cache after sleep for memory. */ dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { numdquot--; DQH_UNLOCK(); mtx_destroy(&dq1->dq_lock); free(dq1, M_DQUOT); goto hfound; } dq = dq1; } else { if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { DQH_UNLOCK(); tablefull("dquot"); *dqp = NODQUOT; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); if (dqvplocked) vput(dqvp); else vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); return (EUSERS); } if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) panic("dqget: free dquot isn't"); TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); if (dq->dq_ump != NULL) LIST_REMOVE(dq, dq_hash); } /* * Dq is put into hash already locked to prevent parallel * usage while it is being read from file. */ dq->dq_flags = DQ_LOCK; dq->dq_id = id; dq->dq_type = type; dq->dq_ump = ump; LIST_INSERT_HEAD(dqh, dq, dq_hash); DQREF(dq); DQH_UNLOCK(); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dq->dq_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)id * sizeof (struct dqblk); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = (struct thread *)0; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); if (auio.uio_resid == sizeof(struct dqblk) && error == 0) bzero(&dq->dq_dqb, sizeof(struct dqblk)); if (dqvplocked) vput(dqvp); else vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { DQH_LOCK(); dq->dq_ump = NULL; LIST_REMOVE(dq, dq_hash); DQH_UNLOCK(); DQI_LOCK(dq); if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags = 0; DQI_UNLOCK(dq); dqrele(vp, dq); *dqp = NODQUOT; return (error); } DQI_LOCK(dq); /* * Check for no limit to enforce. * Initialize time values if necessary. */ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { if (dq->dq_btime == 0) { dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_bsoftlimit && dq->dq_curblocks >= dq->dq_bsoftlimit) dq->dq_flags |= DQ_MOD; } if (dq->dq_itime == 0) { dq->dq_itime = time_second + ump->um_itime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes >= dq->dq_isoftlimit) dq->dq_flags |= DQ_MOD; } } DQI_WAKEUP(dq); DQI_UNLOCK(dq); *dqp = dq; return (0); } #ifdef DIAGNOSTIC /* * Obtain a reference to a dquot. */ static void dqref(dq) struct dquot *dq; { dq->dq_cnt++; } #endif /* * Release a reference to a dquot. */ void dqrele(vp, dq) struct vnode *vp; struct dquot *dq; { if (dq == NODQUOT) return; DQH_LOCK(); if (dq->dq_cnt > 1) { dq->dq_cnt--; DQH_UNLOCK(); return; } DQH_UNLOCK(); (void) dqsync(vp, dq); DQH_LOCK(); if (--dq->dq_cnt > 0) { DQH_UNLOCK(); return; } TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); DQH_UNLOCK(); } /* * Update the disk quota in the quota file. */ static int dqsync(vp, dq) struct vnode *vp; struct dquot *dq; { - struct thread *td = curthread; /* XXX */ struct vnode *dqvp; struct iovec aiov; struct uio auio; int vfslocked, error; struct mount *mp; struct ufsmount *ump; #ifdef DEBUG_VFS_LOCKS if (vp != NULL) ASSERT_VOP_ELOCKED(vp, "dqsync"); #endif mp = NULL; error = 0; if (dq == NODQUOT) panic("dqsync: dquot"); if ((ump = dq->dq_ump) == NULL) return (0); UFS_LOCK(ump); if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP) panic("dqsync: file"); vref(dqvp); UFS_UNLOCK(ump); vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); DQI_LOCK(dq); if ((dq->dq_flags & DQ_MOD) == 0) { DQI_UNLOCK(dq); vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); return (0); } DQI_UNLOCK(dq); (void) vn_start_secondary_write(dqvp, &mp, V_WAIT); if (vp != dqvp) - vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); VFS_UNLOCK_GIANT(vfslocked); DQI_LOCK(dq); DQI_WAIT(dq, PINOD+2, "dqsync"); if ((dq->dq_flags & DQ_MOD) == 0) goto out; dq->dq_flags |= DQ_LOCK; DQI_UNLOCK(dq); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dq->dq_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)dq->dq_id * sizeof (struct dqblk); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = (struct thread *)0; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); VFS_UNLOCK_GIANT(vfslocked); if (auio.uio_resid && error == 0) error = EIO; DQI_LOCK(dq); DQI_WAKEUP(dq); dq->dq_flags &= ~DQ_MOD; out: DQI_UNLOCK(dq); vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); if (vp != dqvp) vput(dqvp); else vrele(dqvp); vn_finished_secondary_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Flush all entries from the cache for a particular vnode. */ static void dqflush(vp) struct vnode *vp; { struct dquot *dq, *nextdq; struct dqhash *dqh; /* * Move all dquot's that used to refer to this quota * file off their hash chains (they will eventually * fall off the head of the free list and be re-used). */ DQH_LOCK(); for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { for (dq = LIST_FIRST(dqh); dq; dq = nextdq) { nextdq = LIST_NEXT(dq, dq_hash); if (dq->dq_ump->um_quotas[dq->dq_type] != vp) continue; if (dq->dq_cnt) panic("dqflush: stray dquot"); LIST_REMOVE(dq, dq_hash); dq->dq_ump = (struct ufsmount *)0; } } DQH_UNLOCK(); } Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 175201) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 175202) @@ -1,2517 +1,2517 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_suiddir.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #ifdef UFS_GJOURNAL #include #endif #include static vop_access_t ufs_access; static vop_advlock_t ufs_advlock; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; static vop_create_t ufs_create; static vop_getattr_t ufs_getattr; static vop_link_t ufs_link; static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); static vop_mkdir_t ufs_mkdir; static vop_mknod_t ufs_mknod; static vop_open_t ufs_open; static vop_pathconf_t ufs_pathconf; static vop_print_t ufs_print; static vop_readlink_t ufs_readlink; static vop_remove_t ufs_remove; static vop_rename_t ufs_rename; static vop_rmdir_t ufs_rmdir; static vop_setattr_t ufs_setattr; static vop_strategy_t ufs_strategy; static vop_symlink_t ufs_symlink; static vop_whiteout_t ufs_whiteout; static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." }; static void ufs_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0) goto out; if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) ip->i_flag |= IN_LAZYMOD; else if (((vp->v_mount->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) || (ip->i_flag & (IN_CHANGE | IN_UPDATE))) ip->i_flag |= IN_MODIFIED; else if (ip->i_flag & IN_ACCESS) ip->i_flag |= IN_LAZYACCESS; vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { DIP_SET(ip, i_atime, ts.tv_sec); DIP_SET(ip, i_atimensec, ts.tv_nsec); } if (ip->i_flag & IN_UPDATE) { DIP_SET(ip, i_mtime, ts.tv_sec); DIP_SET(ip, i_mtimensec, ts.tv_nsec); ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { DIP_SET(ip, i_ctime, ts.tv_sec); DIP_SET(ip, i_ctimensec, ts.tv_nsec); } out: ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ufs_itimes(struct vnode *vp) { VI_LOCK(vp); ufs_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ static int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ DIP_SET(ip, i_rdev, vap->va_rdev); } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. */ /* ARGSUSED */ static int ufs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); ip = VTOI(vp); /* * Files marked append-only must be opened for appending. */ if ((ip->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ufs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); mode_t mode = ap->a_mode; int error; #ifdef UFS_ACL struct acl *acl; #endif /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_ACLS) != 0) { acl = uma_zalloc(acl_zone, M_WAITOK); error = VOP_GETACL(vp, ACL_TYPE_ACCESS, acl, ap->a_cred, ap->a_td); switch (error) { case EOPNOTSUPP: error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); break; case 0: error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid, acl, ap->a_mode, ap->a_cred, NULL); break; default: printf( "ufs_access(): Error retrieving ACL on object (%d).\n", error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. */ error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); } uma_zfree(acl_zone, acl); } else #endif /* !UFS_ACL */ error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL); return (error); } /* ARGSUSED */ static int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; VI_LOCK(vp); ufs_itimes_locked(vp); if (ip->i_ump->um_fstype == UFS1) { vap->va_atime.tv_sec = ip->i_din1->di_atime; vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; } else { vap->va_atime.tv_sec = ip->i_din2->di_atime; vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; if (ip->i_ump->um_fstype == UFS1) { vap->va_rdev = ip->i_din1->di_rdev; vap->va_size = ip->i_din1->di_size; vap->va_mtime.tv_sec = ip->i_din1->di_mtime; vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din1->di_ctime; vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; vap->va_birthtime.tv_sec = 0; vap->va_birthtime.tv_nsec = 0; vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); } else { vap->va_rdev = ip->i_din2->di_rdev; vap->va_size = ip->i_din2->di_size; vap->va_mtime.tv_sec = ip->i_din2->di_mtime; vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din2->di_ctime; vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } /* * Mark for update the file's access time for vfs_mark_atime(). * We are doing this here to avoid some of the checks done * below -- this operation is done by request of the kernel and * should bypass some security checks. Things like read-only * checks get handled by other levels (e.g., ffs_update()). */ if (vap->va_vaflags & VA_MARK_ATIME) { ip->i_flag |= IN_ACCESS; return (0); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the security.jail.chflags_allowed sysctl is * is non-zero; otherwise, they behave like unprivileged * processes. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } /* Snapshot flag cannot be set or cleared */ if (((vap->va_flags & SF_SNAPSHOT) != 0 && (ip->i_flags & SF_SNAPSHOT) == 0) || ((vap->va_flags & SF_SNAPSHOT) == 0 && (ip->i_flags & SF_SNAPSHOT) != 0)) return (EPERM); ip->i_flags = vap->va_flags; DIP_SET(ip, i_flags, vap->va_flags); } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || (vap->va_flags & UF_SETTABLE) != vap->va_flags) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); DIP_SET(ip, i_flags, ip->i_flags); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * XXX most of the following special cases should be in * callers instead of in N filesystems. The VDIR check * mostly already is. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: /* * Truncation should have an effect in these cases. * Disallow it if the filesystem is read-only or * the file is being snapshotted. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL, cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. * * Possibly for historical reasons, try to use VADMIN in * preference to VWRITE for a NULL timestamp. This means we * will return EACCES in preference to EPERM if neither * check succeeds. */ if (vap->va_vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, cred, td); if (error) error = VOP_ACCESS(vp, VWRITE, cred, td); } else error = VOP_ACCESS(vp, VADMIN, cred, td); if (error) return (error); if (vap->va_atime.tv_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; if (vap->va_birthtime.tv_sec != VNOVAL && ip->i_ump->um_fstype == UFS2) ip->i_flag |= IN_MODIFIED; ufs_itimes(vp); if (vap->va_atime.tv_sec != VNOVAL) { DIP_SET(ip, i_atime, vap->va_atime.tv_sec); DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec); } if (vap->va_mtime.tv_sec != VNOVAL) { DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); } if (vap->va_birthtime.tv_sec != VNOVAL && ip->i_ump->um_fstype == UFS2) { ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, td) struct vnode *vp; int mode; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. Both of these are allowed in * jail(8). */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0)) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID, 0); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); DIP_SET(ip, i_mode, ip->i_mode); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, td) struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA int i; ufs2_daddr_t change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if ((uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = DIP(ip, i_blocks); (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; DIP_SET(ip, i_gid, gid); ip->i_uid = uid; DIP_SET(ip, i_uid, uid); #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; DIP_SET(ip, i_gid, ogid); ip->i_uid = ouid; DIP_SET(ip, i_uid, ouid); if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } return (0); } static int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; struct thread *td; td = curthread; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); if (ip->i_nlink <= 0) vp->v_vflag |= VV_NOSYNC; if ((ip->i_flags & SF_SNAPSHOT) != 0) { /* * Avoid deadlock where another thread is trying to * update the inodeblock for dvp and is waiting on * snaplk. Temporary unlock the vnode lock for the * unlinked file and sync the directory. This should * allow vput() of the directory to not block later on * while holding the snapshot vnode locked, assuming * that the directory hasn't been unlinked too. */ VOP_UNLOCK(vp, 0, td); (void) VOP_FSYNC(dvp, MNT_WAIT, td); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } /* * link vnode call */ static int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct direct newdir; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (tdvp->v_mount != vp->v_mount) { error = EXDEV; goto out; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); } if (error) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); } out: return (error); } /* * whiteout vnode call */ static int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef INVARIANTS if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); break; case DELETE: /* remove an existing directory whiteout */ #ifdef INVARIANTS if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; struct inode *ip, *xp, *dp; struct direct newdir; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0, ioflag; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. Temporarily just warn if they do. */ if (fvp == tvp) { printf("ufs_rename: fvp == tvp (can't happen)\n"); error = 0; goto abortit; } - if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= LINK_MAX) { VOP_UNLOCK(fvp, 0, td); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0, td); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0, td); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory = 1; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_change_linkcnt(ip); if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)))) != 0) { VOP_UNLOCK(fvp, 0, td); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0, td); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ufs_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_effnlink++; dp->i_nlink++; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | DOINGASYNC(tdvp))); if (error) goto bad; } ufs_makedirentry(ip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); if (error) { if (doingdirectory && newparent) { dp->i_effnlink--; dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); (void)UFS_UPDATE(tdvp, 1); } goto bad; } vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((dp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { if ((xp->i_effnlink > 2) || !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ufs_dirrewrite(dp, xp, ip->i_number, IFTODT(ip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) goto bad; if (doingdirectory) { if (!newparent) { dp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); } xp->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(xp); } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * Truncate inode. The only stuff left in the directory * is "." and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; } xp->i_nlink--; DIP_SET(xp, i_nlink, xp->i_nlink); xp->i_flag |= IN_CHANGE; ioflag = IO_NORMAL; if (!DOINGASYNC(tvp)) ioflag |= IO_SYNC; if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) goto bad; } vput(tdvp); vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. IN_RENAME is not sufficient * to protect against directory races due to timing windows, * so we have to remove the panic. XXX the only real way * to solve this issue is at a much higher level. By the * time we hit ufs_rename() it's too late. */ #if 0 if (doingdirectory) panic("ufs_rename: lost dir entry"); #endif vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; the IN_RENAME * flag ensures that it cannot be moved by another rename or removed * by a rmdir. */ if (xp != ip) { /* * From name resolves to a different inode. IN_RENAME is * not sufficient protection against timing window races * so we can't panic here. XXX the only real way * to solve this issue is at a much higher level. By the * time we hit ufs_rename() it's too late. */ #if 0 if (doingdirectory) panic("ufs_rename: lost dir entry"); #endif } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { xp->i_offset = mastertemplate.dot_reclen; ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE, td) == 0) { + if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; if (DOINGSOFTDEP(fvp)) softdep_change_linkcnt(ip); vput(fvp); } else vrele(fvp); return (error); } /* * Mkdir system call */ static int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; #ifdef UFS_ACL struct acl *acl, *dacl; #endif int error, dmode; long blkoff; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_gid = dp->i_gid; DIP_SET(ip, i_gid, dp->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; ucp = cnp->cn_cred; #endif /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; DIP_SET(ip, i_uid, dp->i_uid); #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; #ifdef UFS_ACL acl = dacl = NULL; if ((dvp->v_mount->mnt_flag & MNT_ACLS) != 0) { acl = uma_zalloc(acl_zone, M_WAITOK); dacl = uma_zalloc(acl_zone, M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred, cnp->cn_thread); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); uma_zfree(acl_zone, acl); uma_zfree(acl_zone, dacl); dacl = acl = NULL; break; default: UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); uma_zfree(acl_zone, acl); uma_zfree(acl_zone, dacl); return (error); } } else { #endif /* !UFS_ACL */ ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); #ifdef UFS_ACL } #endif tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_effnlink++; dp->i_nlink++; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_change_linkcnt(dp); error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (acl != NULL) { /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred, cnp->cn_thread); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cnp->cn_cred, cnp->cn_thread); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: uma_zfree(acl_zone, acl); uma_zfree(acl_zone, dacl); dacl = acl = NULL; goto bad; } uma_zfree(acl_zone, acl); uma_zfree(acl_zone, dacl); dacl = acl = NULL; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. */ if (dvp->v_mount->mnt_maxsymlinklen > 0) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, BA_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; DIP_SET(ip, i_size, DIRBLKSIZ); ip->i_flag |= IN_CHANGE | IN_UPDATE; vnode_pager_setsize(tvp, (u_long)ip->i_size); bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp)))) != 0) { (void)bwrite(bp); goto bad; } /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); bad: if (error == 0) { *ap->a_vpp = tvp; } else { #ifdef UFS_ACL if (acl != NULL) uma_zfree(acl_zone, acl); if (dacl != NULL) uma_zfree(acl_zone, dacl); #endif dp->i_effnlink--; dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_change_linkcnt(dp); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); vput(tvp); } out: return (error); } /* * Rmdir system call. */ static int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error, ioflag; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) { error = EINVAL; goto out; } if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ dp->i_effnlink--; ip->i_effnlink--; if (DOINGSOFTDEP(vp)) { softdep_change_linkcnt(dp); softdep_change_linkcnt(ip); } error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; if (DOINGSOFTDEP(vp)) { softdep_change_linkcnt(dp); softdep_change_linkcnt(ip); } goto out; } cache_purge(dvp); /* * Truncate inode. The only stuff left in the directory is "." and * "..". The "." reference is inconsequential since we are quashing * it. The soft dependency code will arrange to do these operations * after the parent directory entry has been deleted on disk, so * when running with that code we avoid doing them now. */ if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; ioflag = IO_NORMAL; if (!DOINGASYNC(vp)) ioflag |= IO_SYNC; error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred, cnp->cn_thread); } cache_purge(vp); #ifdef UFS_DIRHASH /* Kill any active hash; i_effnlink == 0, so it will not come back. */ if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: return (error); } /* * symlink -- make a symbolic link */ static int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, SHORTLINK(ip), len); ip->i_size = len; DIP_SET(ip, i_size, len); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, (int *)0, (struct thread *)0); if (error) vput(vp); return (error); } /* * Vnode op for reading directories. * * The routine below assumes that the on-disk format of a directory * is the same as that defined by . If the on-disk * format changes, then it will be necessary to do a conversion * from the on-disk format that read returns to the format defined * by . */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct uio *uio = ap->a_uio; int error; size_t count, lost; off_t off; if (ap->a_ncookies != NULL) /* * Ensure that the block is aligned. The caller can use * the cookies to determine where in the block to start. */ uio->uio_offset &= ~(DIRBLKSIZ - 1); off = uio->uio_offset; count = uio->uio_resid; /* Make sure we don't return partial entries. */ if (count <= ((uio->uio_offset + count) & (DIRBLKSIZ -1))) return (EINVAL); count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); lost = uio->uio_resid - count; uio->uio_resid = count; uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); } else { struct dirent *dp, *edp; struct uio auio; struct iovec aiov; caddr_t dirbuf; int readcnt; u_char tmp; auio = *uio; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { tmp = dp->d_namlen; dp->d_namlen = dp->d_type; dp->d_type = tmp; if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, uio); } FREE(dirbuf, M_TEMP); } # else error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dpEnd; struct dirent* dp; int ncookies; u_long *cookies; u_long *cookiep; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ufs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((char *)uio->uio_iov->iov_base - (uio->uio_offset - off)); dpEnd = (struct dirent *) uio->uio_iov->iov_base; for (dp = dpStart, ncookies = 0; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) ncookies++; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { off += dp->d_reclen; *cookiep++ = (u_long) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } uio->uio_resid += lost; if (ap->a_eofflag) *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ static int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); doff_t isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || DIP(ip, i_blocks) == 0) { /* XXX - for old fastlink support */ return (uiomove(SHORTLINK(ip), isize, ap->a_uio)); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ufs_bmaparray() operation may not * deadlock on memory. See ufs_bmap() for details. */ static int ufs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct bufobj *bo; struct inode *ip; ufs2_daddr_t blkno; int error; ip = VTOI(vp); if (bp->b_blkno == bp->b_lblkno) { error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = ip->i_umbufobj; BO_STRATEGY(bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); printf("\tino %lu, on dev %s", (u_long)ip->i_number, devtoname(ip->i_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ufs kqfilter routines if needed */ static int ufsfifo_kqfilter(ap) struct vop_kqfilter_args *ap; { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ static int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { int error; error = 0; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; break; case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; case _PC_ACL_EXTENDED: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; #else *ap->a_retval = 0; #endif break; case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif break; case _PC_MAC_PRESENT: #ifdef MAC if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) *ap->a_retval = 1; else *ap->a_retval = 0; #else *ap->a_retval = 0; #endif break; case _PC_ASYNC_IO: /* _PC_ASYNC_IO should have been handled by upper layers. */ KASSERT(0, ("_PC_ASYNC_IO should not get here")); error = EINVAL; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = EINVAL; break; } return (error); } /* * Advisory record locking support */ static int ufs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { struct inode *ip = VTOI(ap->a_vp); return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(mntp, fifoops, vpp) struct mount *mntp; struct vop_vector *fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); if (vp->v_type == VFIFO) vp->v_op = fifoops; ASSERT_VOP_LOCKED(vp, "ufs_vinit"); if (ip->i_number == ROOTINO) vp->v_vflag |= VV_ROOT; ip->i_modrev = init_va_filerev(); *vpp = vp; return (0); } /* * Allocate a new inode. * Vnode dvp must be locked. */ static int ufs_makeinode(mode, dvp, vpp, cnp) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; #ifdef UFS_ACL struct acl *acl; #endif int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); ip = VTOI(tvp); ip->i_gid = pdir->i_gid; DIP_SET(ip, i_gid, pdir->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; ucp = cnp->cn_cred; #endif /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; DIP_SET(ip, i_uid, ip->i_uid); mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; #ifdef UFS_ACL acl = NULL; if ((dvp->v_mount->mnt_flag & MNT_ACLS) != 0) { acl = uma_zalloc(acl_zone, M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred, cnp->cn_thread); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; DIP_SET(ip, i_mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; DIP_SET(ip, i_mode, mode); uma_zfree(acl_zone, acl); acl = NULL; break; default: UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); uma_zfree(acl_zone, acl); acl = NULL; return (error); } } else { #endif ip->i_mode = mode; DIP_SET(ip, i_mode, mode); #ifdef UFS_ACL } #endif tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { ip->i_mode &= ~ISGID; DIP_SET(ip, i_mode, ip->i_mode); } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp))); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (acl != NULL) { /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred, cnp->cn_thread); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_makeinode: VOP_GETACL() but no " "VOP_SETACL()\n"); /* panic("ufs_makeinode: VOP_GETACL() but no " "VOP_SETACL()"); */ break; default: uma_zfree(acl_zone, acl); goto bad; } uma_zfree(acl_zone, acl); } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); vput(tvp); return (error); } /* Global vfs data structures for ufs. */ struct vop_vector ufs_vnodeops = { .vop_default = &default_vnodeops, .vop_fsync = VOP_PANIC, .vop_read = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_write = VOP_PANIC, .vop_access = ufs_access, .vop_advlock = ufs_advlock, .vop_bmap = ufs_bmap, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, .vop_create = ufs_create, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_link = ufs_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = ufs_mkdir, .vop_mknod = ufs_mknod, .vop_open = ufs_open, .vop_pathconf = ufs_pathconf, .vop_poll = vop_stdpoll, .vop_print = ufs_print, .vop_readdir = ufs_readdir, .vop_readlink = ufs_readlink, .vop_reclaim = ufs_reclaim, .vop_remove = ufs_remove, .vop_rename = ufs_rename, .vop_rmdir = ufs_rmdir, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_strategy = ufs_strategy, .vop_symlink = ufs_symlink, .vop_whiteout = ufs_whiteout, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; struct vop_vector ufs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_access = ufs_access, .vop_close = ufsfifo_close, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_kqfilter = ufsfifo_kqfilter, .vop_print = ufs_print, .vop_read = VOP_PANIC, .vop_reclaim = ufs_reclaim, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_write = VOP_PANIC, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 175201) +++ head/sys/vm/swap_pager.c (revision 175202) @@ -1,2543 +1,2543 @@ /*- * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_swap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, or 16 * pages per allocation. We recommend you stick with the default of 8. * The 16-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif /* * Piecemeal swap metadata structure. Swap is stored in a radix tree. * * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents * 32K worth of data, two levels represent 256K, three levels represent * 2 MBytes. This is acceptable. * * Overall memory utilization is about the same as the old swap structure. */ #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) #define SWAP_META_PAGES (SWB_NPAGES * 2) #define SWAP_META_MASK (SWAP_META_PAGES - 1) struct swblock { struct swblock *swb_hnext; vm_object_t swb_object; vm_pindex_t swb_index; int swb_count; daddr_t swb_pages[SWAP_META_PAGES]; }; static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static int swdev_syscall_active = 0; /* serialize swap(on|off) */ static void swapdev_strategy(struct buf *, struct swdevt *sw); #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ static int nsw_wcount_async; /* limit write buffers / asynchronous */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static struct swblock **swhash; static int swhash_mask; static struct mtx swhash_mtx; static int swap_async_max = 4; /* maximum in-progress async I/O's */ static struct sx sw_alloc_sx; SYSCTL_INT(_vm, OID_AUTO, swap_async_max, CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct mtx sw_alloc_mtx; /* protect list manipulation */ static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swap_zone; static struct vm_object swap_zone_obj; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * dmmax is in page-sized chunks with the new swap system. It was * dev-bsized chunks in the old. dmmax is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int dmmax; static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static int swapongeom(struct thread *, struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, int npages); static daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. * This routine must be called at splvm() */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWP_PAGER_HASH() - hash swap meta data * * This is an helper function which hashes the swapblk given * the object and page index. It returns a pointer to a pointer * to the object, or a pointer to a NULL pointer if it could not * find a swapblk. * * This routine must be called at splvm(). */ static struct swblock ** swp_pager_hash(vm_object_t object, vm_pindex_t index) { struct swblock **pswap; struct swblock *swap; index &= ~(vm_pindex_t)SWAP_META_MASK; pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; while ((swap = *pswap) != NULL) { if (swap->swb_object == object && swap->swb_index == index ) { break; } pswap = &swap->swb_hnext; } return (pswap); } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); /* * Device Stripe, in PAGE_SIZE'd blocks */ dmmax = SWB_NPAGES * 2; } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { int n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array (MAXPHYS/PAGE_SIZE) and our locally defined * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); mtx_lock(&pbuf_mtx); nsw_rcount = (nswbuf + 1) / 2; nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_unlock(&pbuf_mtx); /* * Initialize our zone. Right now I'm just guessing on the number * we need based on the number of pages in the system. Each swblock * can hold 16 pages, so this is probably overkill. This reservation * is typically limited to around 32MB by default. */ n = cnt.v_page_count / 2; if (maxswzone && n > maxswzone / sizeof(struct swblock)) n = maxswzone / sizeof(struct swblock); n2 = n; swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); if (swap_zone == NULL) panic("failed to create swap_zone."); do { if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); if (n2 != n) printf("Swap zone entries reduced from %d to %d.\n", n2, n); n2 = n; /* * Initialize our meta-data hash table. The swapper does not need to * be quite as efficient as the VM system, so we do not use an * oversized hash table. * * n: size of hash table, must be power of 2 * swhash_mask: hash table index mask */ for (n = 1; n < n2 / 8; n *= 2) ; swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); swhash_mask = n - 1; mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object * and then converting it with swp_pager_meta_build(). * * This routine may block in vm_object_allocate() and create a named * object lookup race, so we must interlock. We must also run at * splvm() for the object lookup to handle races with interrupts, but * we do not have to maintain splvm() in between the lookup and the * add because (I believe) it is not possible to attempt to create * a new swap object w/handle when a default object with that handle * already exists. * * MPSAFE */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; vm_pindex_t pindex; pindex = OFF_TO_IDX(offset + PAGE_MASK + size); if (handle) { mtx_lock(&Giant); /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, pindex); object->handle = handle; VM_OBJECT_LOCK(object); swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); } sx_xunlock(&sw_alloc_sx); mtx_unlock(&Giant); } else { object = vm_object_allocate(OBJT_DEFAULT, pindex); VM_OBJECT_LOCK(object); swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * This routine may block, but no longer does. * * The object must be locked or unreferenceable. */ static void swap_pager_dealloc(vm_object_t object) { /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); mtx_unlock(&sw_alloc_mtx); } VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for the requested number of pages. The starting * swap block number (a page index) is returned or SWAPBLK_NONE * if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * Must be called at splvm() to avoid races with bitmap frees from * vm_page_remove() aka swap_pager_page_removed(). * * This routine may not block * This routine must be called at splvm(). * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int npages) { daddr_t blk; struct swdevt *sp; int i; blk = SWAPBLK_NONE; mtx_lock(&sw_dev_mtx); sp = swdevhd; for (i = 0; i < nswapdev; i++) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if (!(sp->sw_flags & SW_CLOSING)) { blk = blist_alloc(sp->sw_blist, npages); if (blk != SWAPBLK_NONE) { blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); goto done; } } sp = TAILQ_NEXT(sp, sw_list); } if (swap_pager_full != 2) { printf("swap_pager_getswapspace(%d): failed\n", npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; done: mtx_unlock(&sw_dev_mtx); return (blk); } static int swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * Note: This routine may not block (it could in the old swap code), * and through the use of the new blist routines it does not block. * * We must be called at splvm() to avoid races with bitmap frees from * vm_page_remove() aka swap_pager_page_removed(). * * This routine may not block * This routine must be called at splvm(). */ static void swp_pager_freeswapspace(daddr_t blk, int npages) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (blk >= sp->sw_first && blk < sp->sw_end) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * This routine may be called at any spl. We up our spl to splvm temporarily * in order to perform the metadata removal. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zerod. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { int n = 0; daddr_t blk = SWAPBLK_NONE; vm_pindex_t beg = start; /* save start index */ VM_OBJECT_LOCK(object); while (size) { if (n == 0) { n = BLIST_MAX_ALLOC; while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { n >>= 1; if (n == 0) { swp_pager_meta_free(object, beg, start - beg); VM_OBJECT_UNLOCK(object); return (-1); } } } swp_pager_meta_build(object, start, blk); --size; ++start; ++blk; --n; } swp_pager_meta_free(object, start, n); VM_OBJECT_UNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to block. It may block allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * This routine can be called at any spl * * XXX vm_page_collapse() kinda expects us not to block because we * supposedly do not need to allocate memory, but for the moment we * *may* have to get a little memory from the zone allocator, but * it is taken from the interrupt memory. We should be ok. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked or * inaccessible (XXX are they ?) */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED); VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource) { if (srcobject->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE( NOBJLIST(srcobject->handle), srcobject, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * transfer source to destination. */ for (i = 0; i < dstobject->size; ++i) { daddr_t dstaddr; /* * Locate (without changing) the swapblk on the destination, * unless it is invalid in which case free it silently, or * if the destination is a resident page, in which case the * source is thrown away. */ dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr == SWAPBLK_NONE) { /* * Destination has no swapblk and is not resident, * copy source. */ daddr_t srcaddr; srcaddr = swp_pager_meta_ctl( srcobject, i + offset, SWM_POP ); if (srcaddr != SWAPBLK_NONE) { /* * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_UNLOCK(srcobject); vm_object_pip_add(dstobject, 1); swp_pager_meta_build(dstobject, i, srcaddr); vm_object_pip_wakeup(dstobject); VM_OBJECT_LOCK(srcobject); vm_object_pip_wakeup(srcobject); } } else { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the sourceblock. */ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidently * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page within a reasonable * distance. We do not try to restrict it to the swap device stripe * (that is handled in getpages/putpages). It probably isn't worth * doing here. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk0; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = (i - 1); } /* * find forward-looking contiguous good backing store */ if (after != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = (i - 1); } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not block * This routine must be called at splvm() */ static void swap_pager_unswapped(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* * SWAP_PAGER_GETPAGES() - bring pages in from swap * * Attempt to retrieve (m, count) pages from backing store, but make * sure we retrieve at least m[reqpage]. We try to load in as large * a chunk surrounding m[reqpage] as is contiguous in swap and which * belongs to the same object. * * The code is designed for asynchronous operation and * immediate-notification of 'reqpage' but tends not to be * used that way. Please do not optimize-out this algorithmic * feature, I intend to improve on it in the future. * * The parent has a single vm_object_pip_add() reference prior to * calling us and we should return with the same. * * The parent has BUSY'd the pages. We should return with 'm' * left busy, but the others adjusted. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) { struct buf *bp; vm_page_t mreq; int i; int j; daddr_t blk; mreq = m[reqpage]; KASSERT(mreq->object == object, ("swap_pager_getpages: object mismatch %p/%p", object, mreq->object)); /* * Calculate range to retrieve. The pages have already been assigned * their swapblks. We require a *contiguous* range but we know it to * not span devices. If we do not supply it, bad things * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the * loops are set up such that the case(s) are handled implicitly. * * The swp_*() calls must be made at splvm(). vm_page_free() does * not need to be, but it will go a little faster if it is. */ blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); for (i = reqpage - 1; i >= 0; --i) { daddr_t iblk; iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); if (blk != iblk + (reqpage - i)) break; } ++i; for (j = reqpage + 1; j < count; ++j) { daddr_t jblk; jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); if (blk != jblk - (j - reqpage)) break; } /* * free pages outside our collection range. Note: we never free * mreq, it must remain busy throughout. */ if (0 < i || j < count) { int k; vm_page_lock_queues(); for (k = 0; k < i; ++k) vm_page_free(m[k]); for (k = j; k < count; ++k) vm_page_free(m[k]); vm_page_unlock_queues(); } /* * Return VM_PAGER_FAIL if we have nothing to do. Return mreq * still busy, but the others unbusied. */ if (blk == SWAPBLK_NONE) return (VM_PAGER_FAIL); /* * Getpbuf() can sleep. */ VM_OBJECT_UNLOCK(object); /* * Get a swap buffer header to perform the IO */ bp = getpbuf(&nsw_rcount); bp->b_flags |= B_PAGING; /* * map our page(s) into kva for input */ pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i); bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk - (reqpage - i); bp->b_bcount = PAGE_SIZE * (j - i); bp->b_bufsize = PAGE_SIZE * (j - i); bp->b_pager.pg_reqpage = reqpage - i; VM_OBJECT_LOCK(object); { int k; for (k = i; k < j; ++k) { bp->b_pages[k - i] = m[k]; m[k]->oflags |= VPO_SWAPINPROG; } } bp->b_npages = j - i; PCPU_INC(cnt.v_swapin); PCPU_ADD(cnt.v_swappgsin, bp->b_npages); /* * We still hold the lock on mreq, and our automatic completion routine * does not remove it. */ vm_object_pip_add(object, bp->b_npages); VM_OBJECT_UNLOCK(object); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our m[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * wait for the page we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the meta-data. */ VM_OBJECT_LOCK(object); while ((mreq->oflags & VPO_SWAPINPROG) != 0) { mreq->oflags |= VPO_WANTED; vm_page_lock_queues(); vm_page_flag_set(mreq, PG_REFERENCED); vm_page_unlock_queues(); PCPU_INC(cnt.v_intrans); if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * mreq is left busied after completion, but all the other pages * are freed. If we had an unrecoverable read error the page will * not be valid. */ if (mreq->valid != VM_PAGE_BITS_ALL) { return (VM_PAGER_ERROR); } else { return (VM_PAGER_OK); } /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ void swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) { int i; int n = 0; if (count && m[0]->object != object) { panic("swap_pager_putpages: object mismatch %p/%p", object, m[0]->object ); } /* * Step 1 * * Turn object into OBJT_SWAP * check for bogus sysops * force sync if not pageout process */ if (object->type != OBJT_SWAP) swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); if (curproc != pageproc) sync = TRUE; /* * Step 2 * * Update nsw parameters from swap_async_max sysctl values. * Do not let the sysop crash the machine with bogus numbers. */ mtx_lock(&pbuf_mtx); if (swap_async_max != nsw_wcount_async_max) { int n; /* * limit range */ if ((n = swap_async_max) > nswbuf / 2) n = nswbuf / 2; if (n < 1) n = 1; swap_async_max = n; /* * Adjust difference ( if possible ). If the current async * count is too low, we may not be able to make the adjustment * at this time. */ n -= nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } } mtx_unlock(&pbuf_mtx); /* * Step 3 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { int j; struct buf *bp; daddr_t blk; /* * Maximum I/O size is limited by a number of factors. */ n = min(BLIST_MAX_ALLOC, count - i); n = min(n, nsw_cluster_max); /* * Get biggest block of swap we can. If we fail, fall * back and try to allocate a smaller block. Don't go * overboard trying to allocate space if it would overly * fragment swap. */ while ( (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && n > 4 ) { n >>= 1; } if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ if (sync == TRUE) { bp = getpbuf(&nsw_wcount_sync); } else { bp = getpbuf(&nsw_wcount_async); bp->b_flags = B_ASYNC; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_LOCK(object); for (j = 0; j < n; ++j) { vm_page_t mreq = m[i+j]; swp_pager_meta_build( mreq->object, mreq->pindex, blk + j ); vm_page_dirty(mreq); rtvals[i+j] = VM_PAGER_OK; mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_UNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; PCPU_INC(cnt.v_swapout); PCPU_ADD(cnt.v_swappgsout, bp->b_npages); /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ if (sync == FALSE) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* restart outter loop */ continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete, then update rtvals. * We just set the rtvals[] to VM_PAGER_PEND so we can call * our async completion routine at the end, thus avoiding a * double-free. */ bwait(bp, PVM, "swwrt"); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } VM_OBJECT_LOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * For READ operations, the pages are PG_BUSY'd. For WRITE operations, * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY * unbusy all pages except the 'main' request page. For WRITE * operations, we vm_page_t->busy'd unbusy all pages ( we can do this * because we marked them all VM_PAGER_PEND on return from putpages ). * * This routine may not block. * This routine is called at splbio() or better * * We up ourselves to splvm() as required for various vm_page related * calls. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * report error */ if (bp->b_ioflags & BIO_ERROR) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_LOCK(object); } vm_page_lock_queues(); /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * When reading, reqpage needs to stay * locked for the parent, but all other * pages can be freed. We still want to * wakeup the parent waiting on the page, * though. ( also: pg_reqpage can be -1 and * not match anything ). * * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * someone may be waiting for that. * * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; if (i != bp->b_pager.pg_reqpage) vm_page_free(m); else vm_page_flash(m); /* * If i == bp->b_pager.pg_reqpage, do not wake * the page up. The caller needs to. */ } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ vm_page_dirty(m); vm_page_activate(m); vm_page_io_finish(m); } } else if (bp->b_iocmd == BIO_READ) { /* * For read success, clear dirty bits. Nobody should * have this page mapped but don't take any chances, * make sure the pmap modify bits are also cleared. * * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. * * If not the requested page then deactivate it. * * Note that the requested page, reqpage, is left * busied, but we still have to wake it up. The * other pages are released (unbusied) by * vm_page_wakeup(). We do not set reqpage's * valid bits here, it is up to the caller. */ pmap_clear_modify(m); m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); /* * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * could be waiting for it in getpages. However, * be sure to not unbusy getpages specifically * requested page - getpages expects it to be * left busy. */ if (i != bp->b_pager.pg_reqpage) { vm_page_deactivate(m); vm_page_wakeup(m); } else { vm_page_flash(m); } } else { /* * For write success, clear the modify and dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). */ pmap_clear_modify(m); vm_page_undirty(m); vm_page_io_finish(m); if (vm_page_count_severe()) vm_page_try_to_cache(m); } } vm_page_unlock_queues(); /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_UNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ relpbuf( bp, ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : ((bp->b_flags & B_ASYNC) ? &nsw_wcount_async : &nsw_wcount_sync ) ) ); } /* * swap_pager_isswapped: * * Return 1 if at least one page in the given object is paged * out to the given swap device. * * This routine may not block. */ int swap_pager_isswapped(vm_object_t object, struct swdevt *sp) { daddr_t index = 0; int bcount; int i; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return (0); mtx_lock(&swhash_mtx); for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { struct swblock *swap; if ((swap = *swp_pager_hash(object, index)) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { if (swp_pager_isondev(swap->swb_pages[i], sp)) { mtx_unlock(&swhash_mtx); return (1); } } } index += SWAP_META_PAGES; if (index > 0x20000000) panic("swap_pager_isswapped: failed to locate all swap meta blocks"); } mtx_unlock(&swhash_mtx); return (0); } /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * * This routine dissociates the page at the given index within a * swap block from its backing store, paging it in if necessary. * If the page is paged in, it is placed in the inactive queue, * since it had its backing store ripped out from under it. * We also attempt to swap in all other pages in the swap block, * we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we * revert to the one-by-one behavior for now. Sigh. */ static inline void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY); if (m->valid == VM_PAGE_BITS_ALL) { vm_object_pip_subtract(object, 1); vm_page_lock_queues(); vm_page_activate(m); vm_page_dirty(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_pager_page_unswapped(m); return; } if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ vm_object_pip_subtract(object, 1); vm_page_lock_queues(); vm_page_dirty(m); vm_page_dontneed(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_pager_page_unswapped(m); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { struct swblock *swap; int i, j, retries; GIANT_REQUIRED; retries = 0; full_rescan: mtx_lock(&swhash_mtx); for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ restart: for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { vm_object_t object = swap->swb_object; vm_pindex_t pindex = swap->swb_index; for (j = 0; j < SWAP_META_PAGES; ++j) { if (swp_pager_isondev(swap->swb_pages[j], sp)) { /* avoid deadlock */ if (!VM_OBJECT_TRYLOCK(object)) { break; } else { mtx_unlock(&swhash_mtx); swp_pager_force_pagein(object, pindex + j); VM_OBJECT_UNLOCK(object); mtx_lock(&swhash_mtx); goto restart; } } } } } mtx_unlock(&swhash_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. All swp_*() routines must be called at * splvm() because swap can be freed up by the low level vm_page * code which might be called from interrupts beyond what splbio() covers. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is freed. * * This routine must be called at splvm(), except when used to convert * an OBJT_DEFAULT object into an OBJT_SWAP object. */ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { struct swblock *swap; struct swblock **pswap; int idx; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { object->type = OBJT_SWAP; object->un_pager.swp.swp_bcount = 0; if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_INSERT_TAIL( NOBJLIST(object->handle), object, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * Locate hash entry. If not found create, but if we aren't adding * anything just return. If we run out of space in the map we wait * and, since the hash table may have changed, retry. */ retry: mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) == NULL) { int i; if (swapblk == SWAPBLK_NONE) goto done; swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT); if (swap == NULL) { mtx_unlock(&swhash_mtx); VM_OBJECT_UNLOCK(object); if (uma_zone_exhausted(swap_zone)) printf("swap zone exhausted, increase kern.maxswzone\n"); VM_WAIT; VM_OBJECT_LOCK(object); goto retry; } swap->swb_hnext = NULL; swap->swb_object = object; swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; swap->swb_count = 0; ++object->un_pager.swp.swp_bcount; for (i = 0; i < SWAP_META_PAGES; ++i) swap->swb_pages[i] = SWAPBLK_NONE; } /* * Delete prior contents of metadata */ idx = pindex & SWAP_META_MASK; if (swap->swb_pages[idx] != SWAPBLK_NONE) { swp_pager_freeswapspace(swap->swb_pages[idx], 1); --swap->swb_count; } /* * Enter block into metadata */ swap->swb_pages[idx] = swapblk; if (swapblk != SWAPBLK_NONE) ++swap->swb_count; done: mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. * * This routine must be called at splvm() */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return; while (count > 0) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; if (v != SWAPBLK_NONE) { swp_pager_freeswapspace(v, 1); swap->swb_pages[index & SWAP_META_MASK] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } --count; ++index; } else { int n = SWAP_META_PAGES - (index & SWAP_META_MASK); count -= n; index += n; } mtx_unlock(&swhash_mtx); } } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. * * This routine must be called at splvm() */ static void swp_pager_meta_free_all(vm_object_t object) { daddr_t index = 0; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return; while (object->un_pager.swp.swp_bcount) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { int i; for (i = 0; i < SWAP_META_PAGES; ++i) { daddr_t v = swap->swb_pages[i]; if (v != SWAPBLK_NONE) { --swap->swb_count; swp_pager_freeswapspace(v, 1); } } if (swap->swb_count != 0) panic("swap_pager_meta_free_all: swb_count != 0"); *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } mtx_unlock(&swhash_mtx); index += SWAP_META_PAGES; if (index > 0x20000000) panic("swp_pager_meta_free_all: failed to locate all swap meta blocks"); } } /* * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. * * This routine is capable of looking up, popping, or freeing * swapblk assignments in the swap meta data or in the vm_page_t. * The routine typically returns the swapblk being looked-up, or popped, * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block * was invalid. This routine will automatically free any invalid * meta-data swapblks. * * It is not possible to store invalid swapblks in the swap meta data * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * This routine must be called at splvm(). * * SWM_FREE remove and free swap block from metadata * SWM_POP remove from meta data but do not free.. pop it out */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblock **pswap; struct swblock *swap; daddr_t r1; int idx; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * The meta data only exists of the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); r1 = SWAPBLK_NONE; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) != NULL) { idx = pindex & SWAP_META_MASK; r1 = swap->swb_pages[idx]; if (r1 != SWAPBLK_NONE) { if (flags & SWM_FREE) { swp_pager_freeswapspace(r1, 1); r1 = SWAPBLK_NONE; } if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[idx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } } } mtx_unlock(&swhash_mtx); return (r1); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); swdev_syscall_active = 1; /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swap_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(td, vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n", mblocks); nblks = mblocks; } /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_blist = blist_create(nblks); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks, dvbase; #ifdef MAC int error; #endif mtx_assert(&Giant, MA_OWNED); #ifdef MAC - (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY, curthread); + (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0, curthread); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail < nblks + nswap_lowat) { return (ENOMEM); } /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); sp->sw_id = NULL; mtx_lock(&sw_dev_mtx); TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = sp->sw_vp->v_rdev->si_name; else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; int error, n; struct xswdev xs; struct swdevt *sp; if (arg2 != 1) /* name length */ return (EINVAL); n = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n == *name) { mtx_unlock(&sw_dev_mtx); xs.xsw_version = XSWDEV_VERSION; xs.xsw_dev = sp->sw_dev; xs.xsw_flags = sp->sw_flags; xs.xsw_nblks = sp->sw_nblks; xs.xsw_used = sp->sw_used; error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } n++; } mtx_unlock(&sw_dev_mtx); return (ENOENT); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info, "Swap statistics by device"); /* * vmspace_swap_count() - count the approximate swap useage in pages for a * vmspace. * * The map must be locked. * * Swap useage is determined by taking the proportional swap used by * VM objects backing the VM map. To make up for fractional losses, * if the VM object has any swap use at all the associated map entries * count for at least 1 swap page. */ int vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map = &vmspace->vm_map; vm_map_entry_t cur; int count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { vm_object_t object; if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = cur->object.vm_object) != NULL) { VM_OBJECT_LOCK(object); if (object->type == OBJT_SWAP && object->un_pager.swp.swp_bcount != 0) { int n = (cur->end - cur->start) / PAGE_SIZE; count += object->un_pager.swp.swp_bcount * SWAP_META_PAGES * n / object->size + 1; } VM_OBJECT_UNLOCK(object); } } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_done(struct bio *bp2) { struct buf *bp; bp = bp2->bio_caller2; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; cp = sp->sw_id; if (cp == NULL) { bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } bio = g_alloc_bio(); #if 0 /* * XXX: We shouldn't really sleep here when we run out of buffers * XXX: but the alternative is worse right now. */ if (bio == NULL) { bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } #endif bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_data = bp->b_data; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) if (sp->sw_id == cp) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); } static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { /* XXX: direct call when Giant untangled */ g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL); } struct swh0h0 { struct cdev *dev; struct vnode *vp; int error; }; static void swapongeom_ev(void *arg, int flags) { struct swh0h0 *swh; struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; swh = arg; swh->error = 0; pp = g_dev_getprovider(swh->dev); if (pp == NULL) { swh->error = ENODEV; return; } mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); swh->error = EBUSY; return; } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap", NULL); cp = g_new_consumer(gp); g_attach(cp, pp); /* * XXX: Everytime you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); swh->error = error; return; } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(swh->dev)); swh->error = 0; return; } static int swapongeom(struct thread *td, struct vnode *vp) { int error; struct swh0h0 swh; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); swh.dev = vp->v_rdev; swh.vp = vp; swh.error = 0; /* XXX: direct call when Giant untangled */ error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL); if (!error) error = swh.error; VOP_UNLOCK(vp, 0, td); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); - (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0, td); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV); return (0); } Index: head/sys/vm/vm_contig.c =================================================================== --- head/sys/vm/vm_contig.c (revision 175201) +++ head/sys/vm/vm_contig.c (revision 175202) @@ -1,289 +1,289 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vm_contig_launder_page(vm_page_t m, vm_page_t *next) { vm_object_t object; vm_page_t m_tmp; struct vnode *vp; struct mount *mp; int vfslocked; mtx_assert(&vm_page_queue_mtx, MA_OWNED); object = m->object; if (!VM_OBJECT_TRYLOCK(object) && !vm_pageout_fallback_object_lock(m, next)) { VM_OBJECT_UNLOCK(object); return (EAGAIN); } if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) { VM_OBJECT_UNLOCK(object); vm_page_lock_queues(); return (EBUSY); } vm_page_test_dirty(m); if (m->dirty == 0 && m->hold_count == 0) pmap_remove_all(m); if (m->dirty) { if ((object->flags & OBJ_DEAD) != 0) { VM_OBJECT_UNLOCK(object); return (EAGAIN); } if (object->type == OBJT_VNODE) { vm_page_unlock_queues(); vp = object->handle; vm_object_reference_locked(object); VM_OBJECT_UNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_LOCK(object); vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, curthread); VFS_UNLOCK_GIANT(vfslocked); vm_object_deallocate(object); vn_finished_write(mp); vm_page_lock_queues(); return (0); } else if (object->type == OBJT_SWAP || object->type == OBJT_DEFAULT) { m_tmp = m; vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC); VM_OBJECT_UNLOCK(object); return (0); } } else if (m->hold_count == 0) vm_page_cache(m); VM_OBJECT_UNLOCK(object); return (0); } static int vm_contig_launder(int queue) { vm_page_t m, next; int error; TAILQ_FOREACH_SAFE(m, &vm_page_queues[queue].pl, pageq, next) { /* Skip marker pages */ if ((m->flags & PG_MARKER) != 0) continue; KASSERT(VM_PAGE_INQUEUE2(m, queue), ("vm_contig_launder: page %p's queue is not %d", m, queue)); error = vm_contig_launder_page(m, &next); if (error == 0) return (TRUE); if (error == EBUSY) return (FALSE); } return (FALSE); } /* * Frees the given physically contiguous pages. * * N.B.: Any pages with PG_ZERO set must, in fact, be zero filled. */ static void vm_page_release_contig(vm_page_t m, vm_pindex_t count) { while (count--) { /* Leave PG_ZERO unchanged. */ vm_page_free_toq(m); m++; } } /* * Allocates a region from the kernel address map, inserts the * given physically contiguous pages into the kernel object, * creates a wired mapping from the region to the pages, and * returns the region's starting virtual address. If M_ZERO is * specified through the given flags, then the pages are zeroed * before they are mapped. */ static void * contigmapping(vm_page_t m, vm_pindex_t npages, int flags) { vm_object_t object = kernel_object; vm_map_t map = kernel_map; vm_offset_t addr, tmp_addr; vm_pindex_t i; vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr) != KERN_SUCCESS) { vm_map_unlock(map); return (NULL); } vm_object_reference(object); vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS, addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); tmp_addr = addr; VM_OBJECT_LOCK(object); for (i = 0; i < npages; i++) { vm_page_insert(&m[i], object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO)) pmap_zero_page(&m[i]); tmp_addr += PAGE_SIZE; } VM_OBJECT_UNLOCK(object); vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); return ((void *)addr); } void * contigmalloc( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary) { void *ret; vm_page_t pages; unsigned long npgs; int actl, actmax, inactl, inactmax, tries; npgs = round_page(size) >> PAGE_SHIFT; tries = 0; retry: pages = vm_phys_alloc_contig(npgs, low, high, alignment, boundary); if (pages == NULL) { if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) { vm_page_lock_queues(); inactl = 0; inactmax = tries < 1 ? 0 : cnt.v_inactive_count; actl = 0; actmax = tries < 2 ? 0 : cnt.v_active_count; again: if (inactl < inactmax && vm_contig_launder(PQ_INACTIVE)) { inactl++; goto again; } if (actl < actmax && vm_contig_launder(PQ_ACTIVE)) { actl++; goto again; } vm_page_unlock_queues(); tries++; goto retry; } ret = NULL; } else { ret = contigmapping(pages, npgs, flags); if (ret == NULL) vm_page_release_contig(pages, npgs); else malloc_type_allocated(type, npgs << PAGE_SHIFT); } return (ret); } void contigfree(void *addr, unsigned long size, struct malloc_type *type) { vm_pindex_t npgs; npgs = round_page(size) >> PAGE_SHIFT; kmem_free(kernel_map, (vm_offset_t)addr, size); malloc_type_freed(type, npgs << PAGE_SHIFT); } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 175201) +++ head/sys/vm/vm_object.c (revision 175202) @@ -1,2257 +1,2257 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define EASY_SCAN_FACTOR 8 #define MSYNC_FLUSH_HARDSEQ 0x01 #define MSYNC_FLUSH_SOFTSEQ 0x02 /* * msync / VM object flushing optimizations */ static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ; SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0, ""); static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static void vm_object_qcollapse(vm_object_t object); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(object->cache == NULL, ("object %p has cached pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; bzero(&object->mtx, sizeof(object->mtx)); VM_OBJECT_LOCK_INIT(object, "standard object"); /* These are true for any object that has been freed */ object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; return (0); } void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->root = NULL; object->type = type; object->size = size; object->generation = 1; object->ref_count = 1; object->flags = 0; if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) object->flags = OBJ_ONEMAPPING; object->pg_color = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif object->cache = NULL; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->flags &= ~bits; } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { struct vnode *vp; if (object == NULL) return; VM_OBJECT_LOCK(object); object->ref_count++; if (object->type == OBJT_VNODE) { int vfslocked; vp = object->handle; VM_OBJECT_UNLOCK(object); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vget(vp, LK_RETRY, curthread); VFS_UNLOCK_GIANT(vfslocked); } else VM_OBJECT_UNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference_locked: dead object referenced")); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VFS_ASSERT_GIANT(vp->v_mount); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif object->ref_count--; if (object->ref_count == 0) { mp_fixme("Unlocked vflag access."); vp->v_vflag &= ~VV_TEXT; } VM_OBJECT_UNLOCK(object); /* * vrele may need a vop lock */ vrele(vp); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; while (object != NULL) { int vfslocked; vfslocked = 0; restart: VM_OBJECT_LOCK(object); if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *) object->handle; /* * Conditionally acquire Giant for a vnode-backed * object. We have to be careful since the type of * a vnode object can change while the object is * unlocked. */ if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) { vfslocked = 1; if (!mtx_trylock(&Giant)) { VM_OBJECT_UNLOCK(object); mtx_lock(&Giant); goto restart; } } vm_object_vndeallocate(object); VFS_UNLOCK_GIANT(vfslocked); return; } else /* * This is to handle the case that the object * changed type while we dropped its lock to * obtain Giant. */ VFS_UNLOCK_GIANT(vfslocked); KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_UNLOCK(object); return; } else if (object->ref_count == 1) { if (object->shadow_count == 0) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); if (!VM_OBJECT_TRYLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_UNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_UNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_LOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_UNLOCK(robject); object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "objde2", 0); VM_OBJECT_LOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_LOCK(object); goto retry; } } else VM_OBJECT_UNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_UNLOCK(object); continue; } VM_OBJECT_UNLOCK(robject); } VM_OBJECT_UNLOCK(object); return; } doterm: temp = object->backing_object; if (temp != NULL) { VM_OBJECT_LOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; temp->generation++; VM_OBJECT_UNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_UNLOCK(object); object = temp; } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); vinvalbuf(vp, V_SAVE, NULL, 0, 0); VM_OBJECT_LOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. Don't free wired pages, just * remove them from the object. */ vm_page_lock_queues(); while ((p = TAILQ_FIRST(&object->memq)) != NULL) { KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0, ("vm_object_terminate: freeing busy page %p " "p->busy = %d, p->flags %x\n", p, p->busy, p->flags)); if (p->wire_count == 0) { vm_page_free(p); cnt.v_pfree++; } else { vm_page_remove(p); } } vm_page_unlock_queues(); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif if (__predict_false(object->cache != NULL)) vm_page_cache_free(object, 0, 0); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_UNLOCK(object); /* * Remove the object from the global object list. */ mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. */ void vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags) { vm_page_t p, np; vm_pindex_t tstart, tend; vm_pindex_t pi; int clearobjflags; int pagerflags; int curgeneration; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; vm_object_set_flag(object, OBJ_CLEANING); tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } vm_page_lock_queues(); /* * If the caller is smart and only msync()s a range he knows is * dirty, we may be able to avoid an object scan. This results in * a phenominal improvement in performance. We cannot do this * as a matter of course because the object may be huge - e.g. * the size might be in the gigabytes or terrabytes. */ if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) { vm_pindex_t tscan; int scanlimit; int scanreset; scanreset = object->resident_page_count / EASY_SCAN_FACTOR; if (scanreset < 16) scanreset = 16; pagerflags |= VM_PAGER_IGNORE_CLEANCHK; scanlimit = scanreset; tscan = tstart; while (tscan < tend) { curgeneration = object->generation; p = vm_page_lookup(object, tscan); if (p == NULL || p->valid == 0) { if (--scanlimit == 0) break; ++tscan; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { if (--scanlimit == 0) break; ++tscan; continue; } /* * If we have been asked to skip nosync pages and * this is a nosync page, we can't continue. */ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) { if (--scanlimit == 0) break; ++tscan; continue; } scanlimit = scanreset; /* * This returns 0 if it was unable to busy the first * page (i.e. had to sleep). */ tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags); } /* * If everything was dirty and we flushed it successfully, * and the requested range is not the entire object, we * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can * return immediately. */ if (tscan >= tend && (tstart || tend < object->size)) { vm_page_unlock_queues(); vm_object_clear_flag(object, OBJ_CLEANING); return; } pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK; } /* * Generally set CLEANCHK interlock and make the page read-only so * we can then clear the object flags. * * However, if this is a nosync mmap then the object is likely to * stay dirty so do not mess with the page and do not clear the * object flags. */ clearobjflags = 1; TAILQ_FOREACH(p, &object->memq, listq) { p->oflags |= VPO_CLEANCHK; if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) clearobjflags = 0; else pmap_remove_write(p); } if (clearobjflags && (tstart == 0) && (tend == object->size)) { struct vnode *vp; vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); if (vp->v_iflag & VI_OBJDIRTY) vp->v_iflag &= ~VI_OBJDIRTY; VI_UNLOCK(vp); } } rescan: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p; p = np) { int n; np = TAILQ_NEXT(p, listq); again: pi = p->pindex; if ((p->oflags & VPO_CLEANCHK) == 0 || (pi < tstart) || (pi >= tend) || p->valid == 0) { p->oflags &= ~VPO_CLEANCHK; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { p->oflags &= ~VPO_CLEANCHK; continue; } /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were * not cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) { p->oflags &= ~VPO_CLEANCHK; continue; } n = vm_object_page_collect_flush(object, p, curgeneration, pagerflags); if (n == 0) goto rescan; if (object->generation != curgeneration) goto rescan; /* * Try to optimize the next page. If we can't we pick up * our (random) scan where we left off. */ if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) { if ((p = vm_page_lookup(object, pi + n)) != NULL) goto again; } } vm_page_unlock_queues(); #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc); #endif vm_object_clear_flag(object, OBJ_CLEANING); return; } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags) { int runlen; int maxf; int chkb; int maxb; int i; vm_pindex_t pi; vm_page_t maf[vm_pageout_page_count]; vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; mtx_assert(&vm_page_queue_mtx, MA_OWNED); pi = p->pindex; while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) { vm_page_lock_queues(); if (object->generation != curgeneration) { return(0); } } maxf = 0; for(i = 1; i < vm_pageout_page_count; i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi + i)) != NULL) { if ((tp->oflags & VPO_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->oflags & VPO_CLEANCHK) == 0) || (tp->busy != 0)) break; vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->oflags &= ~VPO_CLEANCHK; break; } maf[ i - 1 ] = tp; maxf++; continue; } break; } maxb = 0; chkb = vm_pageout_page_count - maxf; if (chkb) { for(i = 1; i < chkb;i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi - i)) != NULL) { if ((tp->oflags & VPO_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->oflags & VPO_CLEANCHK) == 0) || (tp->busy != 0)) break; vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->oflags &= ~VPO_CLEANCHK; break; } mab[ i - 1 ] = tp; maxb++; continue; } break; } } for(i = 0; i < maxb; i++) { int index = (maxb - i) - 1; ma[index] = mab[i]; ma[index]->oflags &= ~VPO_CLEANCHK; } p->oflags &= ~VPO_CLEANCHK; ma[maxb] = p; for(i = 0; i < maxf; i++) { int index = (maxb + i) + 1; ma[index] = maf[i]; ma[index]->oflags &= ~VPO_CLEANCHK; } runlen = maxb + maxf + 1; vm_pageout_flush(ma, runlen, pagerflags); for (i = 0; i < runlen; i++) { if (ma[i]->valid & ma[i]->dirty) { pmap_remove_write(ma[i]); ma[i]->oflags |= VPO_CLEANCHK; /* * maxf will end up being the actual number of pages * we wrote out contiguously, non-inclusive of the * first page. We do not count look-behind pages. */ if (i >= maxb + 1 && (maxf > i - maxb - 1)) maxf = i - maxb - 1; } } return(maxf + 1); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ void vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int flags; if (object == NULL) return; VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_UNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { int vfslocked; vp = object->handle; VM_OBJECT_UNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? OBJPC_INVAL : 0; VM_OBJECT_LOCK(object); vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, curthread); VFS_UNLOCK_GIANT(vfslocked); vn_finished_write(mp); VM_OBJECT_LOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { boolean_t purge; purge = old_msync || (object->type == OBJT_DEVICE); vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), purge ? FALSE : TRUE); } VM_OBJECT_UNLOCK(object); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise) { vm_pindex_t end, tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; VM_OBJECT_LOCK(object); end = pindex + count; /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } m = vm_page_lookup(tobject, tpindex); if (m == NULL && advise == MADV_WILLNEED) { /* * If the page is cached, reactivate it. */ m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | VM_ALLOC_NOBUSY); } if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_LOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_UNLOCK(tobject); tobject = backing_object; goto shadowlookup; } /* * If the page is busy or not in a normal active state, * we skip it. If the page is not managed there are no * page queues to mess with. Things can break if we mess * with pages in any of the below states. */ vm_page_lock_queues(); if (m->hold_count || m->wire_count || (m->flags & PG_UNMANAGED) || m->valid != VM_PAGE_BITS_ALL) { vm_page_unlock_queues(); goto unlock_tobject; } if ((m->oflags & VPO_BUSY) || m->busy) { vm_page_flag_set(m, PG_REFERENCED); vm_page_unlock_queues(); if (object != tobject) VM_OBJECT_UNLOCK(object); m->oflags |= VPO_WANTED; msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo", 0); VM_OBJECT_LOCK(object); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else if (advise == MADV_DONTNEED) { vm_page_dontneed(m); } else if (advise == MADV_FREE) { /* * Mark the page clean. This will allow the page * to be freed up by the system. However, such pages * are often reused quickly by malloc()/free() * so we do not do anything that would cause * a page fault if we can help it. * * Specifically, we do not try to actually free * the page now nor do we try to put it in the * cache (which would cause a page fault on reuse). * * But we do make the page is freeable as we * can without actually taking the step of unmapping * it. */ pmap_clear_modify(m); m->dirty = 0; m->act_count = 0; vm_page_dontneed(m); } vm_page_unlock_queues(); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_UNLOCK(tobject); } VM_OBJECT_UNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_LOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_UNLOCK(source); return; } VM_OBJECT_UNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, length); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_LOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; source->generation++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & (OBJ_NEEDGIANT | OBJ_COLORED); result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #else result->flags |= source->flags & OBJ_NEEDGIANT; #endif VM_OBJECT_UNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_UNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_LOCK(new_object); VM_OBJECT_LOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_LOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_UNLOCK(source); VM_OBJECT_UNLOCK(orig_object); VM_OBJECT_UNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_LOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; source->generation++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } new_object->flags |= orig_object->flags & OBJ_NEEDGIANT; retry: if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) { if (m->pindex < offidxstart) { m = vm_page_splay(offidxstart, orig_object->root); if ((orig_object->root = m)->pindex < offidxstart) m = TAILQ_NEXT(m, listq); } } vm_page_lock_queues(); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if ((m->oflags & VPO_BUSY) || m->busy) { vm_page_flag_set(m, PG_REFERENCED); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(new_object); m->oflags |= VPO_WANTED; msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0); VM_OBJECT_LOCK(new_object); goto retry; } vm_page_rename(m, new_object, idx); /* page automatically made dirty by rename and cache handled */ vm_page_busy(m); } vm_page_unlock_queues(); if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); /* * Transfer any cached pages from orig_object to new_object. */ if (__predict_false(orig_object->cache != NULL)) vm_page_cache_transfer(orig_object, offidxstart, new_object); } VM_OBJECT_UNLOCK(orig_object); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_wakeup(m); VM_OBJECT_UNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_LOCK(new_object); } #define OBSC_TEST_ALL_SHADOWED 0x0001 #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static int vm_object_backing_scan(vm_object_t object, int op) { int r = 1; vm_page_t p; vm_object_t backing_object; vm_pindex_t backing_offset_index; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if (op & OBSC_TEST_ALL_SHADOWED) { /* * We do not want to have to test for the existence of cache * or swap pages in the backing object. XXX but with the * new swapper this would be pretty easy to do. * * XXX what about anonymous MAP_SHARED memory that hasn't * been ZFOD faulted yet? If we do not test for this, the * shadow test may succeed! XXX */ if (backing_object->type != OBJT_DEFAULT) { return (0); } } if (op & OBSC_COLLAPSE_WAIT) { vm_object_set_flag(backing_object, OBJ_DEAD); } /* * Our scan */ p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next = TAILQ_NEXT(p, listq); vm_pindex_t new_pindex = p->pindex - backing_offset_index; if (op & OBSC_TEST_ALL_SHADOWED) { vm_page_t pp; /* * Ignore pages outside the parent object's range * and outside the parent object's mapping of the * backing object. * * note that we do not busy the backing object's * page. */ if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { p = next; continue; } /* * See if the parent has the page or if the parent's * object pager has the page. If the parent has the * page but the page is not valid, the parent's * object pager must have the page. * * If this fails, the parent does not completely shadow * the object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ( (pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL) ) { r = 0; break; } } /* * Check for busy page */ if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) { vm_page_t pp; if (op & OBSC_COLLAPSE_NOWAIT) { if ((p->oflags & VPO_BUSY) || !p->valid || p->busy) { p = next; continue; } } else if (op & OBSC_COLLAPSE_WAIT) { if ((p->oflags & VPO_BUSY) || p->busy) { vm_page_lock_queues(); vm_page_flag_set(p, PG_REFERENCED); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); p->oflags |= VPO_WANTED; msleep(p, VM_OBJECT_MTX(backing_object), PDROP | PVM, "vmocol", 0); VM_OBJECT_LOCK(object); VM_OBJECT_LOCK(backing_object); /* * If we slept, anything could have * happened. Since the object is * marked dead, the backing offset * should not have changed so we * just restart our scan. */ p = TAILQ_FIRST(&backing_object->memq); continue; } } KASSERT( p->object == backing_object, ("vm_object_backing_scan: object mismatch") ); /* * Destroy any associated swap */ if (backing_object->type == OBJT_SWAP) { swap_pager_freespace( backing_object, p->pindex, 1 ); } if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { /* * Page is out of the parent object's range, we * can simply destroy it. */ vm_page_lock_queues(); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock_queues(); p = next; continue; } pp = vm_page_lookup(object, new_pindex); if ( pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL) ) { /* * page already exists in parent OR swap exists * for this location in the parent. Destroy * the original page from the backing object. * * Leave the parent's page alone */ vm_page_lock_queues(); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock_queues(); p = next; continue; } #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif /* * Page does not exist in parent, rename the * page from the backing object to the main object. * * If the page was mapped to a process, it can remain * mapped through the rename. */ vm_page_lock_queues(); vm_page_rename(p, object, new_pindex); vm_page_unlock_queues(); /* page automatically made dirty by rename */ } p = next; } return (r); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED); if (backing_object->ref_count != 1) return; vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_LOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_UNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_backing_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); /* * Free any cached pages from backing_object. */ if (__predict_false(backing_object->cache != NULL)) vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; if (backing_object->backing_object) { VM_OBJECT_LOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ backing_object->backing_object->generation++; VM_OBJECT_UNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object)); VM_OBJECT_UNLOCK(backing_object); mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE( &vm_object_list, backing_object, object_list ); mtx_unlock(&vm_object_list_mtx); uma_zfree(obj_zone, backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) { VM_OBJECT_UNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_LOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; new_backing_object->generation++; vm_object_reference_locked(new_backing_object); VM_OBJECT_UNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_UNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * Removes all physical pages in the given range from the * object's list of pages. If the range's end is zero, all * physical pages from the range's start to the end of the object * are deleted. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, boolean_t clean_only) { vm_page_t p, next; int wirings; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->resident_page_count == 0) goto skipmemq; /* * Since physically-backed objects do not use managed pages, we can't * remove pages from the object (we must instead remove the page * references, and then destroy the object). */ KASSERT(object->type != OBJT_PHYS || object == kernel_object || object == kmem_object, ("attempt to remove pages from a physical object")); vm_object_pip_add(object, 1); again: vm_page_lock_queues(); if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < start) { p = vm_page_splay(start, object->root); if ((object->root = p)->pindex < start) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the * existence of managed, wired mappings, then it cannot * be freed. */ if ((wirings = p->wire_count) != 0 && (wirings = pmap_page_wired_mappings(p)) != p->wire_count) { pmap_remove_all(p); /* Account for removal of managed, wired mappings. */ p->wire_count -= wirings; if (!clean_only) p->valid = 0; continue; } if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { pmap_remove_write(p); if (p->valid & p->dirty) continue; } pmap_remove_all(p); /* Account for removal of managed, wired mappings. */ if (wirings != 0) p->wire_count -= wirings; vm_page_free(p); } vm_page_unlock_queues(); vm_object_pip_wakeup(object); skipmemq: if (__predict_false(object->cache != NULL)) vm_page_cache_free(object, start, end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_LOCK(prev_object); if (prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, FALSE); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_UNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); vp->v_iflag |= VI_OBJDIRTY; VI_UNLOCK(vp); } } #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; if (_vm_object_in_map(kmem_map, object, 0)) return 1; if (_vm_object_in_map(pager_map, object, 0)) return 1; if (_vm_object_in_map(buffer_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; int nl = 0; int c; TAILQ_FOREACH(object, &vm_object_list, object_list) { vm_pindex_t idx, fidx; vm_pindex_t osize; vm_paddr_t pa = -1; int rcount; vm_page_t m; db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; osize = object->size; if (osize > 128) osize = 128; for (idx = 0; idx < osize; idx++) { m = vm_page_lookup(object, idx); if (m == NULL) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } continue; } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = idx; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 175201) +++ head/sys/vm/vnode_pager.c (revision 175202) @@ -1,1212 +1,1212 @@ /*- * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; int vnode_pbuf_freecnt; /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) return (0); while ((object = vp->v_object) != NULL) { VM_OBJECT_LOCK(object); if (!(object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(object); return (0); } VOP_UNLOCK(vp, 0, td); vm_object_set_flag(object, OBJ_DISCONNECTWNT); msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } if (size == 0) { if (vn_isdisk(vp, NULL)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred, td) != 0) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ VM_OBJECT_LOCK(object); object->ref_count--; VM_OBJECT_UNLOCK(object); vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_LOCK(obj); if (obj->ref_count == 0) { /* * vclean() may be called twice. The first time * removes the primary reference to the object, * the second time goes one further and is a * special-case to terminate the object. * * don't double-terminate the object */ if ((obj->flags & OBJ_DEAD) == 0) vm_object_terminate(obj); else VM_OBJECT_UNLOCK(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_UNLOCK(obj); } vp->v_object = NULL; } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. * * MPSAFE */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; ASSERT_VOP_ELOCKED(vp, "vnode_pager_alloc"); /* * If the object is being terminated, wait for it to * go away. */ while ((object = vp->v_object) != NULL) { VM_OBJECT_LOCK(object); if ((object->flags & OBJ_DEAD) == 0) break; vm_object_set_flag(object, OBJ_DISCONNECTWNT); msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0); } if (vp->v_usecount == 0) panic("vnode_pager_alloc: no vnode reference"); if (object == NULL) { /* * And an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->handle = handle; if (VFS_NEEDSGIANT(vp->v_mount)) vm_object_set_flag(object, OBJ_NEEDGIANT); vp->v_object = object; } else { object->ref_count++; VM_OBJECT_UNLOCK(object); } vref(vp); return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(object) vm_object_t object; { struct vnode *vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); vm_object_pip_wait(object, "vnpdea"); object->handle = NULL; object->type = OBJT_DEAD; if (object->flags & OBJ_DISCONNECTWNT) { vm_object_clear_flag(object, OBJ_DISCONNECTWNT); wakeup(object); } ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); vp->v_object = NULL; vp->v_vflag &= ~VV_TEXT; } static boolean_t vnode_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; int vfslocked; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } VM_OBJECT_UNLOCK(object); vfslocked = VFS_LOCK_GIANT(vp->v_mount); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(object); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { int numafter; *after *= pagesperblock; numafter = pagesperblock - (poff + 1); if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) { numafter = OFF_TO_IDX(object->un_pager.vnp.vnp_size) - pindex; } *after += numafter; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(vp, nsize) struct vnode *vp; vm_ooffset_t nsize; { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; VM_OBJECT_LOCK(object); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_UNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, FALSE); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && m->valid != 0) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Clear out partial-page dirty bits. This * has the side effect of setting the valid * bits, but that is ok. There are a bunch * of places in the VM system where we expected * m->dirty == VM_PAGE_BITS_ALL. The file EOF * case is one of them. If the page is still * partially dirty, make it fully dirty. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_lock_queues(); vm_page_set_validclean(m, base, size); if (m->dirty != 0) m->dirty = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } else if ((nsize & PAGE_MASK) && __predict_false(object->cache != NULL)) { vm_page_cache_free(object, OFF_TO_IDX(nsize), nobjsize); } } object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_UNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(object, m) vm_object_t object; vm_page_t m; { int i; struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; int error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; if (vm_page_bits(i * bsize, bsize) & m->valid) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); } else { VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); } } sf_buf_free(sf); vm_page_lock_queues(); pmap_clear_modify(m); vm_page_unlock_queues(); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(object, m) vm_object_t object; vm_page_t m; { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_UNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_LOCK(object); } vm_page_lock_queues(); pmap_clear_modify(m); vm_page_undirty(m); vm_page_unlock_queues(); if (!error) m->valid = VM_PAGE_BITS_ALL; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { int rtval; struct vnode *vp; int bytes = count * PAGE_SIZE; int vfslocked; vp = object->handle; VM_OBJECT_UNLOCK(object); vfslocked = VFS_LOCK_GIANT(vp->v_mount); rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(object); return rtval; } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(vp, m, bytecount, reqpage) struct vnode *vp; vm_page_t *m; int bytecount; int reqpage; { vm_object_t object; vm_offset_t kva; off_t foff, tfoff, nextoff; int i, j, size, bsize, first; daddr_t firstaddr, reqblock; struct bufobj *bo; int runpg; int runend; struct buf *bp; int count; int error; object = vp->v_object; count = bytecount / PAGE_SIZE; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("vnode_pager_generic_getpages does not support devices")); if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ /* * originally, we did not check for an error return value -- assuming * an fs always has a bmap entry point -- that assumption is wrong!!! */ foff = IDX_TO_OFF(m[reqpage]->pindex); /* * if we can't bmap, use old VOP code */ error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL); if (error == EOPNOTSUPP) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_old(object, m[reqpage]); VM_OBJECT_UNLOCK(object); return (error); } else if (error != 0) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (VM_PAGER_ERROR); /* * if the blocksize is smaller than a page size, then use * special small filesystem code. NFS sometimes has a small * blocksize, but it can handle large reads itself. */ } else if ((PAGE_SIZE / bsize) > 1 && (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); return vnode_pager_input_smlfs(object, m[reqpage]); } /* * If we have a completely valid page available to us, we can * clean up and return. Otherwise we have to re-read the * media. */ VM_OBJECT_LOCK(object); if (m[reqpage]->valid == VM_PAGE_BITS_ALL) { vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return VM_PAGER_OK; } else if (reqblock == -1) { pmap_zero_page(m[reqpage]); vm_page_undirty(m[reqpage]); m[reqpage]->valid = VM_PAGE_BITS_ALL; vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (VM_PAGER_OK); } m[reqpage]->valid = 0; VM_OBJECT_UNLOCK(object); /* * here on direct device I/O */ firstaddr = -1; /* * calculate the run that includes the required page */ for (first = 0, i = 0; i < count; i = runend) { if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr, &runpg) != 0) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (VM_PAGER_ERROR); } if (firstaddr == -1) { VM_OBJECT_LOCK(object); if (i == reqpage && foff < object->un_pager.vnp.vnp_size) { panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx", (intmax_t)firstaddr, (uintmax_t)(foff >> 32), (uintmax_t)foff, (uintmax_t) (object->un_pager.vnp.vnp_size >> 32), (uintmax_t)object->un_pager.vnp.vnp_size); } vm_page_lock_queues(); vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); runend = i + 1; first = runend; continue; } runend = i + runpg; if (runend <= reqpage) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (j = i; j < runend; j++) vm_page_free(m[j]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); } else { if (runpg < (count - first)) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = first + runpg; i < count; i++) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); count = first + runpg; } break; } first = runend; } /* * the first and last page have been calculated now, move input pages * to be zero based... */ if (first != 0) { m += first; count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = IDX_TO_OFF(m[0]->pindex); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; KASSERT(count > 0, ("zero count")); if ((foff + size) > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - foff; KASSERT(size > 0, ("zero size")); /* * round up physical size for real devices. */ if (1) { int secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("vnode_pager_generic_getpages: sector size %d too large", secmask + 1)); size = (size + secmask) & ~secmask; } bp = getpbuf(&vnode_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; /* * and map the pages to be read into the kva */ pmap_qenter(kva, m, count); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_blkno = firstaddr; pbgetbo(bo, bp); bp->b_bcount = size; bp->b_bufsize = size; bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, count); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnread"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } pmap_qremove(kva, count); /* * free the buffer header back to the swap buffer pool */ pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = m[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ mt->valid = VM_PAGE_BITS_ALL; vm_page_undirty(mt); /* should be an assert? XXX */ pmap_clear_modify(mt); } else { /* * Read did not fill up entire page. Since this * is getpages, the page may be mapped, so we have * to zero the invalid portions of the page even * though we aren't setting them valid. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_validclean(mt, 0, object->un_pager.vnp.vnp_size - tfoff); /* handled by vm_fault now */ /* vm_page_zero_invalid(mt, FALSE); */ } if (i != reqpage) { /* * whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere. (it already is in the object). Result: * It appears that empirical results show that * deactivating pages is best. */ /* * just in case someone was asking for this page we * now tell them that it is ok to use */ if (!error) { if (mt->oflags & VPO_WANTED) vm_page_activate(mt); else vm_page_deactivate(mt); vm_page_wakeup(mt); } else { vm_page_free(mt); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); if (error) { printf("vnode_pager_getpages: I/O read error\n"); } return (error ? VM_PAGER_ERROR : VM_PAGER_OK); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int rtval; struct vnode *vp; struct mount *mp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ). The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should be probably be addressed XXX. */ if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min) sync |= OBJPC_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_UNLOCK(object); if (vp->v_type != VREG) mp = NULL; rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_LOCK(object); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occured, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. */ int vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals) struct vnode *vp; vm_page_t *m; int bytecount; int flags; int *rtvals; { int i; vm_object_t object; int count; int maxsize, ncount; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int error; int ioflags; int ppscheck = 0; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; if ((int64_t)m[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n", (long)m[0]->pindex, (u_long)m[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(m[0]->pindex); /* * If the page-aligned write is larger then the actual file we * have to invalidate pages occuring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page locked we are free to fix-up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { int pgoff; maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { vm_page_lock_queues(); vm_page_clear_dirty(m[ncount - 1], pgoff, PAGE_SIZE - pgoff); vm_page_unlock_queues(); } } else { maxsize = 0; ncount = 0; } if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } } } /* * pageouts are already clustered, use IO_ASYNC t o force a bawrite() * rather then a bdwrite() to prevent paging I/O from saturating * the buffer cache. Dummy-up the sequential heuristic to cause * large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set, * the system decides how to cluster. */ ioflags = IO_VMIO; if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ioflags |= IO_SYNC; else if ((flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_td = (struct thread *) 0; error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, ncount); if (error) { if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1))) printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { if (ppscheck || ppsratecheck(&lastfail, &curfail, 1)) printf("vnode_pager_putpages: residual I/O %d at %lu\n", auio.uio_resid, (u_long)m[0]->pindex); } for (i = 0; i < ncount; i++) { rtvals[i] = VM_PAGER_OK; } return rtvals[0]; } struct vnode * vnode_pager_lock(vm_object_t first_object) { struct vnode *vp; vm_object_t backing_object, object; VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED); for (object = first_object; object != NULL; object = backing_object) { if (object->type != OBJT_VNODE) { if ((backing_object = object->backing_object) != NULL) VM_OBJECT_LOCK(backing_object); if (object != first_object) VM_OBJECT_UNLOCK(object); continue; } retry: if (object->flags & OBJ_DEAD) { if (object != first_object) VM_OBJECT_UNLOCK(object); return NULL; } vp = object->handle; VI_LOCK(vp); VM_OBJECT_UNLOCK(object); if (first_object != object) VM_OBJECT_UNLOCK(first_object); VFS_ASSERT_GIANT(vp->v_mount); if (vget(vp, LK_CANRECURSE | LK_INTERLOCK | LK_RETRY | LK_SHARED, curthread)) { VM_OBJECT_LOCK(first_object); if (object != first_object) VM_OBJECT_LOCK(object); if (object->type != OBJT_VNODE) { if (object != first_object) VM_OBJECT_UNLOCK(object); return NULL; } printf("vnode_pager_lock: retrying\n"); goto retry; } VM_OBJECT_LOCK(first_object); return (vp); } return NULL; }