Index: vendor-sys/illumos/dist/uts/common/fs/gfs.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/gfs.c (revision 318932) +++ vendor-sys/illumos/dist/uts/common/fs/gfs.c (revision 318933) @@ -1,1178 +1,1179 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Portions Copyright 2007 Shivakumar GN */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Generic pseudo-filesystem routines. * * There are significant similarities between the implementation of certain file * system entry points across different filesystems. While one could attempt to * "choke up on the bat" and incorporate common functionality into a VOP * preamble or postamble, such an approach is limited in the benefit it can * provide. 
In this file we instead define a toolkit of routines which can be * called from a filesystem (with in-kernel pseudo-filesystems being the focus * of the exercise) in a more component-like fashion. * * There are three basic classes of routines: * * 1) Lowlevel support routines * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() * gfs_readdir_emit() * gfs_readdir_emitn() * gfs_readdir_pred() * gfs_readdir_fini() * gfs_lookup_dot() * * 2) Complete GFS management * * These routines take a more active role in management of the * pseudo-filesystem. They handle the relationship between vnode private * data and VFS data, as well as the relationship between vnodes in the * directory hierarchy. * * In order to use these interfaces, the first member of every private * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control * to GFS. * * gfs_file_create() * gfs_dir_create() * gfs_root_create() * * gfs_file_inactive() * gfs_dir_inactive() * gfs_dir_lookup() * gfs_dir_readdir() * * gfs_vop_inactive() * gfs_vop_lookup() * gfs_vop_readdir() * gfs_vop_map() * * 3) Single File pseudo-filesystems * * This routine creates a rooted file to be overlayed ontop of another * file in the physical filespace. * * Note that the parent is NULL (actually the vfs), but there is nothing * technically keeping such a file from utilizing the "Complete GFS * management" set of routines. * * gfs_root_create_file() */ /* * gfs_make_opsvec: take an array of vnode type definitions and create * their vnodeops_t structures * * This routine takes an array of gfs_opsvec_t's. It could * alternatively take an array of gfs_opsvec_t*'s, which would allow * vnode types to be completely defined in files external to the caller * of gfs_make_opsvec(). 
As it stands, much more sharing takes place -- * both the caller and the vnode type provider need to access gfsv_ops * and gfsv_template, and the caller also needs to know gfsv_name. */ int gfs_make_opsvec(gfs_opsvec_t *vec) { int error, i; for (i = 0; ; i++) { if (vec[i].gfsv_name == NULL) return (0); error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, vec[i].gfsv_ops); if (error) break; } cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", vec[i].gfsv_name); for (i--; i >= 0; i--) { vn_freevnodeops(*vec[i].gfsv_ops); *vec[i].gfsv_ops = NULL; } return (error); } /* * Low level directory routines * * These routines provide some simple abstractions for reading directories. * They are designed to be used by existing pseudo filesystems (namely procfs) * that already have a complicated management infrastructure. */ /* * gfs_get_parent_ino: used to obtain a parent inode number and the * inode number of the given vnode in preparation for calling gfs_readdir_init. */ int gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, ino64_t *pino, ino64_t *ino) { vnode_t *parent; gfs_dir_t *dp = dvp->v_data; int error; *ino = dp->gfsd_file.gfs_ino; parent = dp->gfsd_file.gfs_parent; if (parent == NULL) { *pino = *ino; /* root of filesystem */ } else if (dvp->v_flag & V_XATTRDIR) { vattr_t va; va.va_mask = AT_NODEID; error = VOP_GETATTR(parent, &va, 0, cr, ct); if (error) return (error); *pino = va.va_nodeid; } else { *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; } return (0); } /* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length * ureclen - the exported file-space record length (1 for non-legacy FSs) * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. 
* * Typical VOP_READDIR usage of gfs_readdir_*: * * if ((error = gfs_readdir_init(...)) != 0) * return (error); * eof = 0; * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { * if (!consumer_entry_at(voffset)) * voffset = consumer_next_entry(voffset); * if (consumer_eof(voffset)) { * eof = 1 * break; * } * if ((error = gfs_readdir_emit(..., voffset, * consumer_ino(voffset), consumer_name(voffset))) != 0) * break; * } * return (gfs_readdir_fini(..., error, eofp, eof)); * * As you can see, a zero result from gfs_readdir_pred() or * gfs_readdir_emit() indicates that processing should continue, * whereas a non-zero result indicates that the loop should terminate. * Most consumers need do nothing more than let gfs_readdir_fini() * determine what the cause of failure was and return the appropriate * value. */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, uio_t *uiop, ino64_t parent, ino64_t self, int flags) { size_t dirent_size; if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; if (flags & V_RDDIR_ENTFLAGS) dirent_size = EDIRENT_RECLEN(st->grd_namlen); else dirent_size = DIRENT64_RECLEN(st->grd_namlen); st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); st->grd_parent = parent; st->grd_self = self; st->grd_flags = flags; return (0); } /* * gfs_readdir_emit_int: internal routine to emit directory entry * * st - the current readdir state, which must have d_ino/ed_ino * and d_name/ed_name set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ static int gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next) { int reclen; dirent64_t *dp; edirent_t *edp; if (st->grd_flags & V_RDDIR_ENTFLAGS) { edp = st->grd_dirent; reclen = EDIRENT_RECLEN(strlen(edp->ed_name)); } else { dp = st->grd_dirent; reclen = DIRENT64_RECLEN(strlen(dp->d_name)); } if (reclen > 
uiop->uio_resid) { /* * Error if no entries were returned yet */ if (uiop->uio_resid == st->grd_oresid) return (EINVAL); return (-1); } if (st->grd_flags & V_RDDIR_ENTFLAGS) { edp->ed_off = next; edp->ed_reclen = (ushort_t)reclen; } else { dp->d_off = next; dp->d_reclen = (ushort_t)reclen; } if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); uiop->uio_loffset = next; return (0); } /* * gfs_readdir_emit: emit a directory entry * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name * eflags - value for ed_eflags (if processing edirent_t) * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, const char *name, int eflags) { offset_t off = (voff + 2) * st->grd_ureclen; if (st->grd_flags & V_RDDIR_ENTFLAGS) { edirent_t *edp = st->grd_dirent; edp->ed_ino = ino; (void) strncpy(edp->ed_name, name, st->grd_namlen); edp->ed_eflags = eflags; } else { dirent64_t *dp = st->grd_dirent; dp->d_ino = ino; (void) strncpy(dp->d_name, name, st->grd_namlen); } /* * Inter-entry offsets are invalid, so we assume a record size of * grd_ureclen and explicitly set the offset appropriately. */ return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen)); } /* * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer * instead of a string for the entry's name. 
*/ int gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, unsigned long num) { char buf[40]; numtos(num, buf); return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); } /* * gfs_readdir_pred: readdir loop predicate * voffp - a pointer in which the next virtual offset should be stored * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp) { offset_t off, voff; int error; top: if (uiop->uio_resid <= 0) return (-1); off = uiop->uio_loffset / st->grd_ureclen; voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, ".", 0)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, "..", 0)) == 0) goto top; } else { *voffp = voff; return (0); } return (error); } /* * gfs_readdir_fini: generic readdir cleanup * error - if positive, an error to return * eofp - the eofp passed to readdir * eof - the eof value * * Returns a 0 on success, a non-zero errno on failure. This result * should be returned from readdir. */ int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { size_t dirent_size; if (st->grd_flags & V_RDDIR_ENTFLAGS) dirent_size = EDIRENT_RECLEN(st->grd_namlen); else dirent_size = DIRENT64_RECLEN(st->grd_namlen); kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) *eofp = eof; return (0); } /* * gfs_lookup_dot * * Performs a basic check for "." and ".." directory entries. 
*/ int gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) { if (*nm == '\0' || strcmp(nm, ".") == 0) { VN_HOLD(dvp); *vpp = dvp; return (0); } else if (strcmp(nm, "..") == 0) { if (pvp == NULL) { ASSERT(dvp->v_flag & VROOT); VN_HOLD(dvp); *vpp = dvp; } else { VN_HOLD(pvp); *vpp = pvp; } return (0); } return (-1); } /* * gfs_file_create(): create a new GFS file * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * * In order to use this interface, the parent vnode must have been created by * gfs_dir_create(), and the private data stored in v_data must have a * 'gfs_file_t' as its first field. * * Given these constraints, this routine will automatically: * * - Allocate v_data for the vnode * - Initialize necessary fields in the vnode * - Hold the parent */ vnode_t * gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops) { gfs_file_t *fp; vnode_t *vp; /* * Allocate vnode and internal data structure */ fp = kmem_zalloc(size, KM_SLEEP); vp = vn_alloc(KM_SLEEP); /* * Set up various pointers */ fp->gfs_vnode = vp; fp->gfs_parent = pvp; vp->v_data = fp; fp->gfs_size = size; fp->gfs_type = GFS_FILE; /* * Initialize vnode and hold parent. */ vn_setops(vp, ops); if (pvp) { VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0); VN_HOLD(pvp); } return (vp); } /* * gfs_dir_create: creates a new directory in the parent * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * entries - NULL-terminated list of static entries (if any) * maxlen - maximum length of a directory entry * readdir_cb - readdir callback (see gfs_dir_readdir) * inode_cb - inode callback (see gfs_dir_readdir) * lookup_cb - lookup callback (see gfs_dir_lookup) * * In order to use this function, the first member of the private vnode * structure (v_data) must be a gfs_dir_t. 
For each directory, there are * static entries, defined when the structure is initialized, and dynamic * entries, retrieved through callbacks. * * If a directory has static entries, then it must supply a inode callback, * which will compute the inode number based on the parent and the index. * For a directory with dynamic entries, the caller must supply a readdir * callback and a lookup callback. If a static lookup fails, we fall back to * the supplied lookup callback, if any. * * This function also performs the same initialization as gfs_file_create(). */ vnode_t * gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; gfs_dir_t *dp; gfs_dirent_t *de; vp = gfs_file_create(struct_size, pvp, ops); vp->v_type = VDIR; dp = vp->v_data; dp->gfsd_file.gfs_type = GFS_DIR; dp->gfsd_maxlen = maxlen; if (entries != NULL) { for (de = entries; de->gfse_name != NULL; de++) dp->gfsd_nstatic++; dp->gfsd_static = kmem_alloc( dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); bcopy(entries, dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } dp->gfsd_readdir = readdir_cb; dp->gfsd_lookup = lookup_cb; dp->gfsd_inode = inode_cb; mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); return (vp); } /* * gfs_root_create(): create a root vnode for a GFS filesystem * * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The * only difference is that it takes a vfs_t instead of a vnode_t as its parent. 
*/ vnode_t * gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb, maxlen, readdir_cb, lookup_cb); /* Manually set the inode */ ((gfs_file_t *)vp->v_data)->gfs_ino = ino; VFS_HOLD(vfsp); VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0); vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; return (vp); } /* * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem * * Similar to gfs_root_create(), this creates a root vnode for a file to * be the pseudo-filesystem. */ vnode_t * gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino) { vnode_t *vp = gfs_file_create(size, NULL, ops); ((gfs_file_t *)vp->v_data)->gfs_ino = ino; VFS_HOLD(vfsp); VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0); vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; return (vp); } /* * gfs_file_inactive() * * Called from the VOP_INACTIVE() routine. If necessary, this routine will * remove the given vnode from the parent directory and clean up any references * in the VFS layer. * * If the vnode was not removed (due to a race with vget), then NULL is * returned. Otherwise, a pointer to the private data is returned. */ void * gfs_file_inactive(vnode_t *vp) { int i; gfs_dirent_t *ge = NULL; gfs_file_t *fp = vp->v_data; gfs_dir_t *dp = NULL; void *data; if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) goto found; dp = fp->gfs_parent->v_data; /* * First, see if this vnode is cached in the parent. */ gfs_dir_lock(dp); /* * Find it in the set of static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (ge->gfse_vnode == vp) goto found; } /* * If 'ge' is NULL, then it is a dynamic entry. 
*/ ge = NULL; found: if (vp->v_flag & V_XATTRDIR) { mutex_enter(&fp->gfs_parent->v_lock); } mutex_enter(&vp->v_lock); if (vp->v_count == 1) { /* * Really remove this vnode */ data = vp->v_data; if (ge != NULL) { /* * If this was a statically cached entry, simply set the * cached vnode to NULL. */ ge->gfse_vnode = NULL; } if (vp->v_flag & V_XATTRDIR) { fp->gfs_parent->v_xattrdir = NULL; mutex_exit(&fp->gfs_parent->v_lock); } mutex_exit(&vp->v_lock); /* * Free vnode and release parent */ if (fp->gfs_parent) { if (dp) { gfs_dir_unlock(dp); } VN_RELE(fp->gfs_parent); } else { ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } vn_free(vp); } else { - vp->v_count--; + VN_RELE_LOCKED(vp); data = NULL; mutex_exit(&vp->v_lock); if (vp->v_flag & V_XATTRDIR) { mutex_exit(&fp->gfs_parent->v_lock); } if (dp) gfs_dir_unlock(dp); } return (data); } /* * gfs_dir_inactive() * * Same as above, but for directories. */ void * gfs_dir_inactive(vnode_t *vp) { gfs_dir_t *dp; ASSERT(vp->v_type == VDIR); if ((dp = gfs_file_inactive(vp)) != NULL) { mutex_destroy(&dp->gfsd_lock); if (dp->gfsd_nstatic) kmem_free(dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } return (dp); } /* * gfs_dir_lookup_dynamic() * * This routine looks up the provided name amongst the dynamic entries * in the gfs directory and returns the corresponding vnode, if found. * * The gfs directory is expected to be locked by the caller prior to * calling this function. The directory will be unlocked during the * execution of this function, but will be locked upon return from the * function. This function returns 0 on success, non-zero on error. * * The dynamic lookups are performed by invoking the lookup * callback, which is passed to this function as the first argument. 
* The arguments to the callback are: * * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode * cr - pointer to cred * flags - flags value from lookup request * ignored here; currently only used to request * insensitive lookups * direntflgs - output parameter, directory entry flags * ignored here; currently only used to indicate a lookup * has more than one possible match when case is not considered * realpnp - output parameter, real pathname * ignored here; when lookup was performed case-insensitively, * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ static int gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, int *direntflags, pathname_t *realpnp) { gfs_file_t *fp; ino64_t ino; int ret; ASSERT(GFS_DIR_LOCKED(dp)); /* * Drop the directory lock, as the lookup routine * will need to allocate memory, or otherwise deadlock on this * directory. */ gfs_dir_unlock(dp); ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* * The callback for extended attributes returns a vnode * with v_data from an underlying fs. */ if (ret == 0 && !IS_XATTRDIR(dvp)) { fp = (gfs_file_t *)((*vpp)->v_data); fp->gfs_index = -1; fp->gfs_ino = ino; } return (ret); } /* * gfs_dir_lookup_static() * * This routine looks up the provided name amongst the static entries * in the gfs directory and returns the corresponding vnode, if found. * The first argument to the function is a pointer to the comparison * function this function should use to decide if names are a match. * * If a match is found, and GFS_CACHE_VNODE is set and the vnode * exists, we simply return the existing vnode. Otherwise, we call * the static entry's callback routine, caching the result if * necessary. 
If the idx pointer argument is non-NULL, we use it to * return the index of the matching static entry. * * The gfs directory is expected to be locked by the caller prior to calling * this function. The directory may be unlocked during the execution of * this function, but will be locked upon return from the function. * * This function returns 0 if a match is found, ENOENT if not. */ static int gfs_dir_lookup_static(int (*compare)(const char *, const char *), gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, vnode_t **vpp, pathname_t *rpnp) { gfs_dirent_t *ge; vnode_t *vp = NULL; int i; ASSERT(GFS_DIR_LOCKED(dp)); /* * Search static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (compare(ge->gfse_name, nm) == 0) { if (rpnp) (void) strlcpy(rpnp->pn_buf, ge->gfse_name, rpnp->pn_bufsize); if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); break; } /* * We drop the directory lock, as the constructor will * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set * for this entry, we discard the result in favor of * the cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); gfs_dir_lock(dp); ((gfs_file_t *)vp->v_data)->gfs_index = i; /* Set the inode according to the callback. */ ((gfs_file_t *)vp->v_data)->gfs_ino = dp->gfsd_inode(dvp, i); if (ge->gfse_flags & GFS_CACHE_VNODE) { if (ge->gfse_vnode == NULL) { ge->gfse_vnode = vp; } else { /* * A parallel constructor beat us to it; * return existing vnode. We have to be * careful because we can't release the * current vnode while holding the * directory lock; its inactive routine * will try to lock this directory. 
*/ vnode_t *oldvp = vp; vp = ge->gfse_vnode; VN_HOLD(vp); gfs_dir_unlock(dp); VN_RELE(oldvp); gfs_dir_lock(dp); } } break; } } if (vp == NULL) return (ENOENT); else if (idx) *idx = i; *vpp = vp; return (0); } /* * gfs_dir_lookup() * * Looks up the given name in the directory and returns the corresponding * vnode, if found. * * First, we search statically defined entries, if any, with a call to * gfs_dir_lookup_static(). If no static entry is found, and we have * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). * * This function returns 0 on success, non-zero on error. */ int gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, int flags, int *direntflags, pathname_t *realpnp) { gfs_dir_t *dp = dvp->v_data; boolean_t casecheck; vnode_t *dynvp = NULL; vnode_t *vp = NULL; int (*compare)(const char *, const char *); int error, idx; ASSERT(dvp->v_type == VDIR); if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) return (0); casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || (flags & FIGNORECASE)) compare = strcasecmp; else compare = strcmp; gfs_dir_lock(dp); error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); if (vp && casecheck) { gfs_dirent_t *ge; int i; for (i = idx + 1; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (strcasecmp(ge->gfse_name, nm) == 0) { *direntflags |= ED_CASE_CONFLICT; goto out; } } } if ((error || casecheck) && dp->gfsd_lookup) error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, &dynvp, cr, flags, direntflags, vp ? 
NULL : realpnp); if (vp && dynvp) { /* static and dynamic entries are case-insensitive conflict */ ASSERT(casecheck); *direntflags |= ED_CASE_CONFLICT; VN_RELE(dynvp); } else if (vp == NULL) { vp = dynvp; } else if (error == ENOENT) { error = 0; } else if (error) { VN_RELE(vp); vp = NULL; } out: gfs_dir_unlock(dp); *vpp = vp; return (error); } /* * gfs_dir_readdir: does a readdir() on the given directory * * dvp - directory vnode * uiop - uio structure * eofp - eof pointer * data - arbitrary data passed to readdir callback * * This routine does all the readdir() dirty work. Even so, the caller must * supply two callbacks in order to get full compatibility. * * If the directory contains static entries, an inode callback must be * specified. This avoids having to create every vnode and call VOP_GETATTR() * when reading the directory. This function has the following arguments: * * ino_t gfs_inode_cb(vnode_t *vp, int index); * * vp - vnode for the directory * index - index in original gfs_dirent_t array * * Returns the inode number for the given entry. * * For directories with dynamic entries, a readdir callback must be provided. * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, * offset_t *off, offset_t *nextoff, void *data, int flags) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS * is set in 'flags'. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left * untouched. * nextoff - callback must set to offset of next entry. 
Typically * (off + 1) * data - caller-supplied data * flags - VOP_READDIR flags * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr, caller_context_t *ct, int flags) { gfs_readdir_state_t gstate; int error, eof = 0; ino64_t ino, pino; offset_t off, next; gfs_dir_t *dp = dvp->v_data; error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino); if (error) return (error); if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, pino, ino, flags)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 && !eof) { if (off >= 0 && off < dp->gfsd_nstatic) { ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, off, ino, dp->gfsd_static[off].gfse_name, 0)) != 0) break; } else if (dp->gfsd_readdir) { off -= dp->gfsd_nstatic; if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, data, flags)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; next += dp->gfsd_nstatic + 2; if ((error = gfs_readdir_emit_int(&gstate, uiop, next)) != 0) break; } else { /* * Offset is beyond the end of the static entries, and * we have no dynamic entries. Set EOF. */ eof = 1; } } return (gfs_readdir_fini(&gstate, error, eofp, eof)); } /* * gfs_vop_lookup: VOP_LOOKUP() entry point * * For use directly in vnode ops table. Given a GFS directory, calls * gfs_dir_lookup() as necessary. */ /* ARGSUSED */ int gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); } /* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. Given a GFS directory, calls * gfs_dir_readdir() as necessary. 
*/ /* ARGSUSED */ int gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct, int flags) { return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags)); } /* * gfs_vop_map: VOP_MAP() entry point * * Convenient routine for handling pseudo-files that wish to allow mmap() calls. * This function only works for readonly files, and uses the read function for * the vnode to fill in the data. The mapped data is immediately faulted in and * filled with the necessary data during this call; there are no getpage() or * putpage() routines. */ /* ARGSUSED */ int gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred, caller_context_t *ct) { int rv; ssize_t resid = len; /* * Check for bad parameters */ #ifdef _ILP32 if (len > MAXOFF_T) return (ENOMEM); #endif if (vp->v_flag & VNOMAP) return (ENOTSUP); if (off > MAXOFF_T) return (EFBIG); if ((long)off < 0 || (long)(off + len) < 0) return (EINVAL); if (vp->v_type != VREG) return (ENODEV); if ((prot & (PROT_EXEC | PROT_WRITE)) != 0) return (EACCES); /* * Find appropriate address if needed, otherwise clear address range. */ as_rangelock(as); rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); if (rv != 0) { as_rangeunlock(as); return (rv); } /* * Create mapping */ rv = as_map(as, *addrp, len, segvn_create, zfod_argsp); as_rangeunlock(as); if (rv != 0) return (rv); /* * Fill with data from read() */ rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE, 0, (rlim64_t)0, cred, &resid); if (rv == 0 && resid != 0) rv = ENXIO; if (rv != 0) { as_rangelock(as); (void) as_unmap(as, *addrp, len); as_rangeunlock(as); } return (rv); } /* * gfs_vop_inactive: VOP_INACTIVE() entry point * * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. 
*/ /* ARGSUSED */ void gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { gfs_file_t *fp = vp->v_data; void *data; if (fp->gfs_type == GFS_DIR) data = gfs_dir_inactive(vp); else data = gfs_file_inactive(vp); if (data != NULL) kmem_free(data, fp->gfs_size); } Index: vendor-sys/illumos/dist/uts/common/fs/vnode.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/vnode.c (revision 318932) +++ vendor-sys/illumos/dist/uts/common/fs/vnode.c (revision 318933) @@ -1,4577 +1,4579 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \ (vp)->v_type != VFIFO && vn_is_readonly(vp)) /* Tunable via /etc/system; used only by admin/install */ int nfs_global_client_only; /* * Array of vopstats_t for per-FS-type vopstats. This array has the same * number of entries as and parallel to the vfssw table. (Arguably, it could * be part of the vfssw table.) Once it's initialized, it's accessed using * the same fstype index that is used to index into the vfssw table. */ vopstats_t **vopstats_fstype; /* vopstats initialization template used for fast initialization via bcopy() */ static vopstats_t *vs_templatep; /* Kmem cache handle for vsk_anchor_t allocations */ kmem_cache_t *vsk_anchor_cache; /* file events cleanup routine */ extern void free_fopdata(vnode_t *); /* * Root of AVL tree for the kstats associated with vopstats. Lock protects * updates to vsktat_tree. */ avl_tree_t vskstat_tree; kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; /* * forward declarations for internal vnode specific data (vsd) */ static void *vsd_realloc(void *, size_t, size_t); /* * forward declarations for reparse point functions */ static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr); /* * VSD -- VNODE SPECIFIC DATA * The v_data pointer is typically used by a file system to store a * pointer to the file system's private node (e.g. ufs inode, nfs rnode). * However, there are times when additional project private data needs * to be stored separately from the data (node) pointed to by v_data. 
* This additional data could be stored by the file system itself or * by a completely different kernel entity. VSD provides a way for * callers to obtain a key and store a pointer to private data associated * with a vnode. * * Callers are responsible for protecting the vsd by holding v_vsd_lock * for calls to vsd_set() and vsd_get(). */ /* * vsd_lock protects: * vsd_nkeys - creation and deletion of vsd keys * vsd_list - insertion and deletion of vsd_node in the vsd_list * vsd_destructor - adding and removing destructors to the list */ static kmutex_t vsd_lock; static uint_t vsd_nkeys; /* size of destructor array */ /* list of vsd_node's */ static list_t *vsd_list = NULL; /* per-key destructor funcs */ static void (**vsd_destructor)(void *); /* * The following is the common set of actions needed to update the * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and * VOPSTATS_UPDATE_IO() do almost the same thing, except for the * recording of the bytes transferred. Since the code is similar * but small, it is nearly a duplicate. Consequently any changes * to one may need to be reflected in the other. 
* Rundown of the variables: * vp - Pointer to the vnode * counter - Partial name structure member to update in vopstats for counts * bytecounter - Partial name structure member to update in vopstats for bytes * bytesval - Value to update in vopstats for bytes * fstype - Index into vsanchor_fstype[], same as index into vfssw[] * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i]) */ #define VOPSTATS_UPDATE(vp, counter) { \ vfs_t *vfsp = (vp)->v_vfsp; \ if (vfsp && vfsp->vfs_implp && \ (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \ vopstats_t *vsp = &vfsp->vfs_vopstats; \ uint64_t *stataddr = &(vsp->n##counter.value.ui64); \ extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \ size_t, uint64_t *); \ __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \ (*stataddr)++; \ if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \ vsp->n##counter.value.ui64++; \ } \ } \ } #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \ vfs_t *vfsp = (vp)->v_vfsp; \ if (vfsp && vfsp->vfs_implp && \ (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \ vopstats_t *vsp = &vfsp->vfs_vopstats; \ uint64_t *stataddr = &(vsp->n##counter.value.ui64); \ extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \ size_t, uint64_t *); \ __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \ (*stataddr)++; \ vsp->bytecounter.value.ui64 += bytesval; \ if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \ vsp->n##counter.value.ui64++; \ vsp->bytecounter.value.ui64 += bytesval; \ } \ } \ } /* * If the filesystem does not support XIDs map credential * If the vfsp is NULL, perhaps we should also map? */ #define VOPXID_MAP_CR(vp, cr) { \ vfs_t *vfsp = (vp)->v_vfsp; \ if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \ cr = crgetmapped(cr); \ } /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
*/ enum vtype iftovt_tab[] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; ushort_t vttoif_tab[] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0 }; /* * The system vnode cache. */ kmem_cache_t *vn_cache; /* * Vnode operations vector. */ static const fs_operation_trans_def_t vn_ops_table[] = { VOPNAME_OPEN, offsetof(struct vnodeops, vop_open), fs_nosys, fs_nosys, VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close), fs_nosys, fs_nosys, VOPNAME_READ, offsetof(struct vnodeops, vop_read), fs_nosys, fs_nosys, VOPNAME_WRITE, offsetof(struct vnodeops, vop_write), fs_nosys, fs_nosys, VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl), fs_nosys, fs_nosys, VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl), fs_setfl, fs_nosys, VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr), fs_nosys, fs_nosys, VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr), fs_nosys, fs_nosys, VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access), fs_nosys, fs_nosys, VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup), fs_nosys, fs_nosys, VOPNAME_CREATE, offsetof(struct vnodeops, vop_create), fs_nosys, fs_nosys, VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove), fs_nosys, fs_nosys, VOPNAME_LINK, offsetof(struct vnodeops, vop_link), fs_nosys, fs_nosys, VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename), fs_nosys, fs_nosys, VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir), fs_nosys, fs_nosys, VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir), fs_nosys, fs_nosys, VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir), fs_nosys, fs_nosys, VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink), fs_nosys, fs_nosys, VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink), fs_nosys, fs_nosys, VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync), fs_nosys, fs_nosys, VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive), fs_nosys, fs_nosys, VOPNAME_FID, offsetof(struct 
vnodeops, vop_fid), fs_nosys, fs_nosys, VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock), fs_rwlock, fs_rwlock, VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock), (fs_generic_func_p) fs_rwunlock, (fs_generic_func_p) fs_rwunlock, /* no errors allowed */ VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek), fs_nosys, fs_nosys, VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp), fs_cmp, fs_cmp, /* no errors allowed */ VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock), fs_frlock, fs_nosys, VOPNAME_SPACE, offsetof(struct vnodeops, vop_space), fs_nosys, fs_nosys, VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp), fs_nosys, fs_nosys, VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage), fs_nosys, fs_nosys, VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage), fs_nosys, fs_nosys, VOPNAME_MAP, offsetof(struct vnodeops, vop_map), (fs_generic_func_p) fs_nosys_map, (fs_generic_func_p) fs_nosys_map, VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap), (fs_generic_func_p) fs_nosys_addmap, (fs_generic_func_p) fs_nosys_addmap, VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap), fs_nosys, fs_nosys, VOPNAME_POLL, offsetof(struct vnodeops, vop_poll), (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll, VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump), fs_nosys, fs_nosys, VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf), fs_pathconf, fs_nosys, VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio), fs_nosys, fs_nosys, VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl), fs_nosys, fs_nosys, VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose), (fs_generic_func_p) fs_dispose, (fs_generic_func_p) fs_nodispose, VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr), fs_nosys, fs_nosys, VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr), fs_fab_acl, fs_nosys, VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock), fs_shrlock, fs_nosys, VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent), (fs_generic_func_p) 
fs_vnevent_nosupport, (fs_generic_func_p) fs_vnevent_nosupport, VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf), fs_nosys, fs_nosys, VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf), fs_nosys, fs_nosys, NULL, 0, NULL, NULL }; /* Extensible attribute (xva) routines. */ /* * Zero out the structure, set the size of the requested/returned bitmaps, * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer * to the returned attributes array. */ void xva_init(xvattr_t *xvap) { bzero(xvap, sizeof (xvattr_t)); xvap->xva_mapsize = XVA_MAPSIZE; xvap->xva_magic = XVA_MAGIC; xvap->xva_vattr.va_mask = AT_XVATTR; xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; } /* * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t * structure. Otherwise, returns NULL. */ xoptattr_t * xva_getxoptattr(xvattr_t *xvap) { xoptattr_t *xoap = NULL; if (xvap->xva_vattr.va_mask & AT_XVATTR) xoap = &xvap->xva_xoptattrs; return (xoap); } /* * Used by the AVL routines to compare two vsk_anchor_t structures in the tree. * We use the f_fsid reported by VFS_STATVFS() since we use that for the * kstat name. */ static int vska_compar(const void *n1, const void *n2) { int ret; ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid; ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid; if (p1 < p2) { ret = -1; } else if (p1 > p2) { ret = 1; } else { ret = 0; } return (ret); } /* * Used to create a single template which will be bcopy()ed to a newly * allocated vsanchor_combo_t structure in new_vsanchor(), below. 
*/ static vopstats_t * create_vopstats_template() { vopstats_t *vsp; vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP); bzero(vsp, sizeof (*vsp)); /* Start fresh */ /* VOP_OPEN */ kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64); /* VOP_CLOSE */ kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64); /* VOP_READ I/O */ kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64); kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64); /* VOP_WRITE I/O */ kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64); kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64); /* VOP_IOCTL */ kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64); /* VOP_SETFL */ kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64); /* VOP_GETATTR */ kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64); /* VOP_SETATTR */ kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64); /* VOP_ACCESS */ kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64); /* VOP_LOOKUP */ kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64); /* VOP_CREATE */ kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64); /* VOP_REMOVE */ kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64); /* VOP_LINK */ kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64); /* VOP_RENAME */ kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64); /* VOP_MKDIR */ kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64); /* VOP_RMDIR */ kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64); /* VOP_READDIR I/O */ kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64); kstat_named_init(&vsp->readdir_bytes, "readdir_bytes", KSTAT_DATA_UINT64); /* VOP_SYMLINK */ kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64); /* VOP_READLINK */ kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64); /* VOP_FSYNC */ kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64); /* VOP_INACTIVE */ 
kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64); /* VOP_FID */ kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64); /* VOP_RWLOCK */ kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64); /* VOP_RWUNLOCK */ kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64); /* VOP_SEEK */ kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64); /* VOP_CMP */ kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64); /* VOP_FRLOCK */ kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64); /* VOP_SPACE */ kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64); /* VOP_REALVP */ kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64); /* VOP_GETPAGE */ kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64); /* VOP_PUTPAGE */ kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64); /* VOP_MAP */ kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64); /* VOP_ADDMAP */ kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64); /* VOP_DELMAP */ kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64); /* VOP_POLL */ kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64); /* VOP_DUMP */ kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64); /* VOP_PATHCONF */ kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64); /* VOP_PAGEIO */ kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64); /* VOP_DUMPCTL */ kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64); /* VOP_DISPOSE */ kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64); /* VOP_SETSECATTR */ kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64); /* VOP_GETSECATTR */ kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64); /* VOP_SHRLOCK */ kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); /* VOP_VNEVENT */ kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); /* VOP_REQZCBUF */ kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); 
/* VOP_RETZCBUF */ kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64); return (vsp); } /* * Creates a kstat structure associated with a vopstats structure. */ kstat_t * new_vskstat(char *ksname, vopstats_t *vsp) { kstat_t *ksp; if (!vopstats_enabled) { return (NULL); } ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED, sizeof (vopstats_t)/sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); if (ksp) { ksp->ks_data = vsp; kstat_install(ksp); } return (ksp); } /* * Called from vfsinit() to initialize the support mechanisms for vopstats */ void vopstats_startup() { if (!vopstats_enabled) return; /* * Creates the AVL tree which holds per-vfs vopstat anchors. This * is necessary since we need to check if a kstat exists before we * attempt to create it. Also, initialize its lock. */ avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t), offsetof(vsk_anchor_t, vsk_node)); mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL); vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache", sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0); /* * Set up the array of pointers for the vopstats-by-FS-type. * The entries will be allocated/initialized as each file system * goes through modload/mod_installfs. */ vopstats_fstype = (vopstats_t **)kmem_zalloc( (sizeof (vopstats_t *) * nfstype), KM_SLEEP); /* Set up the global vopstats initialization template */ vs_templatep = create_vopstats_template(); } /* * We need to have the all of the counters zeroed. * The initialization of the vopstats_t includes on the order of * 50 calls to kstat_named_init(). Rather that do that on every call, * we do it once in a template (vs_templatep) then bcopy it over. */ void initialize_vopstats(vopstats_t *vsp) { if (vsp == NULL) return; bcopy(vs_templatep, vsp, sizeof (vopstats_t)); } /* * If possible, determine which vopstats by fstype to use and * return a pointer to the caller. 
*/ vopstats_t * get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp) { int fstype = 0; /* Index into vfssw[] */ vopstats_t *vsp = NULL; if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) return (NULL); /* * Set up the fstype. We go to so much trouble because all versions * of NFS use the same fstype in their vfs even though they have * distinct entries in the vfssw[] table. * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry. */ if (vswp) { fstype = vswp - vfssw; /* Gets us the index */ } else { fstype = vfsp->vfs_fstype; } /* * Point to the per-fstype vopstats. The only valid values are * non-zero positive values less than the number of vfssw[] table * entries. */ if (fstype > 0 && fstype < nfstype) { vsp = vopstats_fstype[fstype]; } return (vsp); } /* * Generate a kstat name, create the kstat structure, and allocate a * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t * to the caller. This must only be called from a mount. */ vsk_anchor_t * get_vskstat_anchor(vfs_t *vfsp) { char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */ statvfs64_t statvfsbuf; /* Needed to find f_fsid */ vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */ kstat_t *ksp; /* Ptr to new kstat */ avl_index_t where; /* Location in the AVL tree */ if (vfsp == NULL || vfsp->vfs_implp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) return (NULL); /* Need to get the fsid to build a kstat name */ if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) { /* Create a name for our kstats based on fsid */ (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx", VOPSTATS_STR, statvfsbuf.f_fsid); /* Allocate and initialize the vsk_anchor_t */ vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP); bzero(vskp, sizeof (*vskp)); vskp->vsk_fsid = statvfsbuf.f_fsid; mutex_enter(&vskstat_tree_lock); if (avl_find(&vskstat_tree, vskp, &where) == NULL) { avl_insert(&vskstat_tree, vskp, where); mutex_exit(&vskstat_tree_lock); /* * Now that we've got the 
anchor in the AVL * tree, we can create the kstat. */ ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats); if (ksp) { vskp->vsk_ksp = ksp; } } else { /* Oops, found one! Release memory and lock. */ mutex_exit(&vskstat_tree_lock); kmem_cache_free(vsk_anchor_cache, vskp); vskp = NULL; } } return (vskp); } /* * We're in the process of tearing down the vfs and need to cleanup * the data structures associated with the vopstats. Must only be called * from dounmount(). */ void teardown_vopstats(vfs_t *vfsp) { vsk_anchor_t *vskap; avl_index_t where; if (vfsp == NULL || vfsp->vfs_implp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) return; /* This is a safe check since VFS_STATS must be set (see above) */ if ((vskap = vfsp->vfs_vskap) == NULL) return; /* Whack the pointer right away */ vfsp->vfs_vskap = NULL; /* Lock the tree, remove the node, and delete the kstat */ mutex_enter(&vskstat_tree_lock); if (avl_find(&vskstat_tree, vskap, &where)) { avl_remove(&vskstat_tree, vskap); } if (vskap->vsk_ksp) { kstat_delete(vskap->vsk_ksp); } mutex_exit(&vskstat_tree_lock); kmem_cache_free(vsk_anchor_cache, vskap); } /* * Read or write a vnode. Called from kernel code. */ int vn_rdwr( enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ cred_t *cr, ssize_t *residp) { struct uio uio; struct iovec iov; int error; int in_crit = 0; if (rw == UIO_WRITE && ISROFILE(vp)) return (EROFS); if (len < 0) return (EIO); VOPXID_MAP_CR(vp, cr); iov.iov_base = base; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_loffset = offset; uio.uio_segflg = (short)seg; uio.uio_resid = len; uio.uio_llimit = ulimit; /* * We have to enter the critical region before calling VOP_RWLOCK * to avoid a deadlock with ufs. 
*/ if (nbl_need_check(vp)) { int svmand; nbl_start_crit(vp, RW_READER); in_crit = 1; error = nbl_svmand(vp, cr, &svmand); if (error != 0) goto done; if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ, uio.uio_offset, uio.uio_resid, svmand, NULL)) { error = EACCES; goto done; } } (void) VOP_RWLOCK(vp, rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); if (rw == UIO_WRITE) { uio.uio_fmode = FWRITE; uio.uio_extflg = UIO_COPY_DEFAULT; error = VOP_WRITE(vp, &uio, ioflag, cr, NULL); } else { uio.uio_fmode = FREAD; uio.uio_extflg = UIO_COPY_CACHED; error = VOP_READ(vp, &uio, ioflag, cr, NULL); } VOP_RWUNLOCK(vp, rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); if (residp) *residp = uio.uio_resid; else if (uio.uio_resid) error = EIO; done: if (in_crit) nbl_end_crit(vp); return (error); } /* * Release a vnode. Call VOP_INACTIVE on last reference or * decrement reference count. * * To avoid race conditions, the v_count is left at 1 for * the call to VOP_INACTIVE. This prevents another thread * from reclaiming and releasing the vnode *before* the * VOP_INACTIVE routine has a chance to destroy the vnode. * We can't have more than 1 thread calling VOP_INACTIVE * on a vnode. */ void vn_rele(vnode_t *vp) { VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } /* * Release a vnode referenced by the DNLC. Multiple DNLC references are treated * as a single reference, so v_count is not decremented until the last DNLC hold * is released. This makes it possible to distinguish vnodes that are referenced * only by the DNLC. 
*/ void vn_rele_dnlc(vnode_t *vp) { VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); mutex_enter(&vp->v_lock); if (--vp->v_count_dnlc == 0) { if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); } mutex_exit(&vp->v_lock); } /* * Like vn_rele() except that it clears v_stream under v_lock. - * This is used by sockfs when it dismantels the association between - * the sockfs node and the vnode in the underlaying file system. + * This is used by sockfs when it dismantles the association between + * the sockfs node and the vnode in the underlying file system. * v_lock has to be held to prevent a thread coming through the lookupname * path from accessing a stream head that is going away. */ void vn_rele_stream(vnode_t *vp) { VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); vp->v_stream = NULL; if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } static void vn_rele_inactive(vnode_t *vp) { VOP_INACTIVE(vp, CRED(), NULL); } /* * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it * asynchronously using a taskq. This can avoid deadlocks caused by re-entering * the file system as a result of releasing the vnode. Note, file systems * already have to handle the race where the vnode is incremented before the * inactive routine is called and does its locking. * * Warning: Excessive use of this routine can lead to performance problems. * This is because taskqs throttle back allocation if too many are created. 
*/ void vn_rele_async(vnode_t *vp, taskq_t *taskq) { VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive, vp, TQ_SLEEP) != NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } int vn_open( char *pnamep, enum uio_seg seg, int filemode, int createmode, struct vnode **vpp, enum create crwhy, mode_t umask) { return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, umask, NULL, -1)); } /* * Open/create a vnode. * This may be callable by the kernel, the only known use * of user context being that the current user credentials * are used for permissions. crwhy is defined iff filemode & FCREAT. */ int vn_openat( char *pnamep, enum uio_seg seg, int filemode, int createmode, struct vnode **vpp, enum create crwhy, mode_t umask, struct vnode *startvp, int fd) { struct vnode *vp; int mode; int accessflags; int error; int in_crit = 0; int open_done = 0; int shrlock_done = 0; struct vattr vattr; enum symfollow follow; int estale_retry = 0; struct shrlock shr; struct shr_locowner shr_own; mode = 0; accessflags = 0; if (filemode & FREAD) mode |= VREAD; if (filemode & (FWRITE|FTRUNC)) mode |= VWRITE; if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN)) mode |= VEXEC; /* symlink interpretation */ if (filemode & FNOFOLLOW) follow = NO_FOLLOW; else follow = FOLLOW; if (filemode & FAPPEND) accessflags |= V_APPEND; top: if (filemode & FCREAT) { enum vcexcl excl; /* * Wish to create a file. */ vattr.va_type = VREG; vattr.va_mode = createmode; vattr.va_mask = AT_TYPE|AT_MODE; if (filemode & FTRUNC) { vattr.va_size = 0; vattr.va_mask |= AT_SIZE; } if (filemode & FEXCL) excl = EXCL; else excl = NONEXCL; if (error = vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy, (filemode & ~(FTRUNC|FEXCL)), umask, startvp)) return (error); } else { /* * Wish to open a file. Just look it up. 
*/ if (error = lookupnameat(pnamep, seg, follow, NULLVPP, &vp, startvp)) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto top; return (error); } /* * Get the attributes to check whether file is large. * We do this only if the FOFFMAX flag is not set and * only for regular files. */ if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) { vattr.va_mask = AT_SIZE; if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) { goto out; } if (vattr.va_size > (u_offset_t)MAXOFF32_T) { /* * Large File API - regular open fails * if FOFFMAX flag is set in file mode */ error = EOVERFLOW; goto out; } } /* * Can't write directories, active texts, or * read-only filesystems. Can't truncate files * on which mandatory locking is in effect. */ if (filemode & (FWRITE|FTRUNC)) { /* * Allow writable directory if VDIROPEN flag is set. */ if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) { error = EISDIR; goto out; } if (ISROFILE(vp)) { error = EROFS; goto out; } /* * Can't truncate files on which * sysv mandatory locking is in effect. */ if (filemode & FTRUNC) { vnode_t *rvp; if (VOP_REALVP(vp, &rvp, NULL) != 0) rvp = vp; if (rvp->v_filocks != NULL) { vattr.va_mask = AT_MODE; if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) == 0 && MANDLOCK(vp, vattr.va_mode)) error = EAGAIN; } } if (error) goto out; } /* * Check permissions. */ if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL)) goto out; /* * Require FSEARCH to return a directory. * Require FEXEC to return a regular file. */ if ((filemode & FSEARCH) && vp->v_type != VDIR) { error = ENOTDIR; goto out; } if ((filemode & FEXEC) && vp->v_type != VREG) { error = ENOEXEC; /* XXX: error code? */ goto out; } } /* * Do remaining checks for FNOFOLLOW and FNOLINKS. 
*/ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { error = ELOOP; goto out; } if (filemode & FNOLINKS) { vattr.va_mask = AT_NLINK; if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) { goto out; } if (vattr.va_nlink != 1) { error = EMLINK; goto out; } } /* * Opening a socket corresponding to the AF_UNIX pathname * in the filesystem name space is not supported. * However, VSOCK nodes in namefs are supported in order * to make fattach work for sockets. * * XXX This uses VOP_REALVP to distinguish between * an unopened namefs node (where VOP_REALVP returns a * different VSOCK vnode) and a VSOCK created by vn_create * in some file system (where VOP_REALVP would never return * a different vnode). */ if (vp->v_type == VSOCK) { struct vnode *nvp; error = VOP_REALVP(vp, &nvp, NULL); if (error != 0 || nvp == NULL || nvp == vp || nvp->v_type != VSOCK) { error = EOPNOTSUPP; goto out; } } if ((vp->v_type == VREG) && nbl_need_check(vp)) { /* get share reservation */ shr.s_access = 0; if (filemode & FWRITE) shr.s_access |= F_WRACC; if (filemode & FREAD) shr.s_access |= F_RDACC; shr.s_deny = 0; shr.s_sysid = 0; shr.s_pid = ttoproc(curthread)->p_pid; shr_own.sl_pid = shr.s_pid; shr_own.sl_id = fd; shr.s_own_len = sizeof (shr_own); shr.s_owner = (caddr_t)&shr_own; error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(), NULL); if (error) goto out; shrlock_done = 1; /* nbmand conflict check if truncating file */ if ((filemode & FTRUNC) && !(filemode & FCREAT)) { nbl_start_crit(vp, RW_READER); in_crit = 1; vattr.va_mask = AT_SIZE; if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) goto out; if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0, NULL)) { error = EACCES; goto out; } } } /* * Do opening protocol. */ error = VOP_OPEN(&vp, filemode, CRED(), NULL); if (error) goto out; open_done = 1; /* * Truncate if required. 
*/ if ((filemode & FTRUNC) && !(filemode & FCREAT)) { vattr.va_size = 0; vattr.va_mask = AT_SIZE; if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) goto out; } out: ASSERT(vp->v_count > 0); if (in_crit) { nbl_end_crit(vp); in_crit = 0; } if (error) { if (open_done) { (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(), NULL); open_done = 0; shrlock_done = 0; } if (shrlock_done) { (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(), NULL); shrlock_done = 0; } /* * The following clause was added to handle a problem * with NFS consistency. It is possible that a lookup * of the file to be opened succeeded, but the file * itself doesn't actually exist on the server. This * is chiefly due to the DNLC containing an entry for * the file which has been removed on the server. In * this case, we just start over. If there was some * other cause for the ESTALE error, then the lookup * of the file will fail and the error will be returned * above instead of looping around from here. */ VN_RELE(vp); if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto top; } else *vpp = vp; return (error); } /* * The following two accessor functions are for the NFSv4 server. Since there * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the * vnode open counts correct when a client "upgrades" an open or does an * open_downgrade. In NFS, an upgrade or downgrade can not only change the * open mode (add or subtract read or write), but also change the share/deny * modes. However, share reservations are not integrated with OPEN, yet, so * we need to handle each separately. These functions are cleaner than having * the NFS server manipulate the counts directly, however, nobody else should * use these functions. 
 */

/*
 * Account for an upgrade of the open modes on an already-open regular
 * file: bump the vnode's read and/or write open counts for each mode
 * (FREAD/FWRITE) present in filemode.  Counts are kept only for VREG
 * vnodes (asserted).
 */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);

}

/*
 * Inverse of vn_open_upgrade(): drop the read and/or write open counts
 * for each mode present in filemode.  The counts must not underflow
 * (asserted).
 */
void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}

}

/*
 * Convenience front end to vn_createat() with a NULL start vnode
 * (i.e. the pathname is resolved from the process's current/root
 * directory rather than relative to an explicit starting vnode).
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}

/*
 * Create a vnode (makenode).
 *
 * pnamep/seg: pathname to create and the address space it lives in.
 * vap: attributes for the new node; AT_TYPE and AT_MODE must be set
 *	(asserted).
 * excl: EXCL for exclusive create, NONEXCL otherwise.
 * vpp: on success, holds the created (or pre-existing, for NONEXCL)
 *	vnode.
 * why: CRMKDIR / CRMKNOD / open-style create; selects VOP_MKDIR vs.
 *	VOP_CREATE and some mode handling.
 * startvp: optional starting vnode for relative lookup (may be NULL).
 *
 * Returns 0 or an errno.  ESTALE from lookup or the VOP is retried
 * (bounded by fs_need_estale_retry()).
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;	/* nonzero iff we hold an nbl critical region */
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {
		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error.
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * VOP_GETSECATTR() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *		it is being open with write access &&
	 *		the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (VOP_REALVP(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				u_offset_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply VOP_CREATE to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The intercession of the file system is necessary to
		 * ensure that the appropriate permission checks are
		 * done.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created
			 * a new reference to the vnode.  Give up the
			 * original reference.  The assertion should not
			 * get triggered because NBMAND locks only apply to
			 * VREG files.  And if in_crit is non-zero for some
			 * reason, detect that here, rather than when we
			 * dereference a null vp.
			 */
			ASSERT(in_crit == 0);
			VN_RELE(vp);
			vp = NULL;
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to VOP_MKDIR().  VOP_CREATE()
			 * will already get it via "flag"
			 */
			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = VOP_CREATE(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Convenience front end to vn_linkat() with NULL start vnodes and
 * NO_FOLLOW symlink handling on the source.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}

/*
 * Make a hard link named 'to' referring to the existing object 'from',
 * each optionally resolved relative to a start vnode.  Both pathnames
 * live in the address space indicated by seg.  Fails with EXDEV if the
 * source and the target directory are on different filesystems
 * (compared by fsid) and EROFS if the target filesystem is read-only.
 * ESTALE failures are retried (bounded by fs_need_estale_retry()).
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;

	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}

	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Convenience front end to vn_renameat() with NULL start vnodes.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}

/*
 * Rename fname to tname, each optionally resolved relative to a start
 * directory vnode.  Enforces same-filesystem (EXDEV, compared by fsid),
 * read-only (EROFS), and non-blocking-mandatory-lock (EACCES) checks
 * before delegating to VOP_RENAME() on the source directory.  ESTALE
 * failures are retried (bounded by fs_need_estale_retry()).
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories.
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * If the target already exists, enter its nbl critical region and
	 * make sure removing it does not conflict with a share reservation.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}

/*
 * Remove the entry named by fnamep, optionally resolved relative to
 * startvp.  dirflag selects rmdir(2) (RMDIRECTORY) vs. unlink(2)
 * semantics.  Handles the special case of removing a file that has a
 * namefs mount on top of it (unmounts first, then removes the covered
 * vnode).  ESTALE failures are retried (bounded by
 * fs_need_estale_retry()).
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Utility function to compare equality of vnodes.
 * Compare the underlying real vnodes, if there are underlying vnodes.
 * This is a more thorough comparison than the VN_CMP() macro provides.
 */
int
vn_compare(vnode_t *vp1, vnode_t *vp2)
{
	vnode_t *realvp;

	if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
		vp1 = realvp;
	if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
		vp2 = realvp;
	return (VN_CMP(vp1, vp2));
}

/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket of vfs/vnode lock entries.  The pad keeps each bucket
 * on its own 64-byte boundary (see the #pragma align below).
 */
struct vn_vfslocks_bucket {
	kmutex_t vb_lock;		/* protects vb_list */
	vn_vfslocks_entry_t *vb_list;	/* chain of entries in this bucket */
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)

/*
 * vn_vfslocks_getlock() uses a hash scheme to generate
 * rwstlock using vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

/*
 * Look up (or create) the lock entry hashed from the given vfs/vnode
 * pointer, taking a reference on it.  The entry's ve_lock is the
 * rwstlock associated with that pointer.  Allocation is done outside
 * the bucket lock; if another thread installed an equal entry in the
 * meantime, the freshly allocated one is destroyed and the existing
 * entry returned.
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}

/*
 * Drop a reference on a lock entry obtained from vn_vfslocks_getlock().
 * When the refcount reaches zero the entry is unlinked from its bucket
 * and freed.  Panics on refcount underflow or if the entry is missing
 * from its bucket.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/*
					 * pvep is set on every prior
					 * iteration; the head case is
					 * handled above.
					 */
					/* LINTED */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}

/*
 * vn_vfswlock_wait is used to implement a lock which is logically a writers
 * lock protecting the v_vfsmountedhere field.
 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
 * except that it blocks to acquire the lock VVFSLOCK.
 *
 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock i.e.
 * vn_vfswlock
 */
int
vn_vfswlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);

	if (retval == EINTR) {
		/* interrupted by a signal: drop our table reference */
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}
	return (retval);
}

/*
 * Blocking reader-side counterpart of vn_vfswlock_wait().
 */
int
vn_vfsrlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);

	if (retval == EINTR) {
		/* interrupted by a signal: drop our table reference */
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}

	return (retval);
}

/*
 * vn_vfswlock is used to implement a lock which is logically a writers lock
 * protecting the v_vfsmountedhere field.  Non-blocking: returns EBUSY if
 * the lock cannot be acquired immediately.
 */
int
vn_vfswlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * If vp is NULL then somebody is trying to lock the covered vnode
	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
	 * only happen when unmounting /.  Since that operation will fail
	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
	 */
	if (vp == NULL)
		return (EBUSY);

	vpvfsentry = vn_vfslocks_getlock(vp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
		return (0);

	/* lock not acquired: drop our table reference */
	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

/*
 * Non-blocking reader-side counterpart of vn_vfswlock().
 */
int
vn_vfsrlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * If vp is NULL then somebody is trying to lock the covered vnode
	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
	 * only happen when unmounting /.  Since that operation will fail
	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
	 */
	if (vp == NULL)
		return (EBUSY);

	vpvfsentry = vn_vfslocks_getlock(vp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
		return (0);

	/* lock not acquired: drop our table reference */
	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

/*
 * Release a lock previously acquired by one of the vn_vfs[rw]lock*
 * routines above.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release reference after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}

/*
 * Return nonzero if the write-side vfs lock for this vnode is held.
 */
int
vn_vfswlock_held(vnode_t *vp)
{
	int held;
	vn_vfslocks_entry_t *vpvfsentry;

	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);

	vn_vfslocks_rele(vpvfsentry);
	return (held);
}

/*
 * Allocate a vnodeops_t and populate it from the given operation
 * template via fs_build_vector().  On failure the allocation is freed
 * and the error from fs_build_vector() is returned; on success *actual
 * holds the new vector.
 */
int
vn_make_ops(
	const char *name,			/* Name of file system */
	const fs_operation_def_t *templ,	/* Operation specification */
	vnodeops_t **actual)			/* Return the vnodeops */
{
	int unused_ops;
	int error;

	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);

	(*actual)->vnop_name = name;

	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
	if (error) {
		kmem_free(*actual, sizeof (vnodeops_t));
	}

#if DEBUG
	if (unused_ops != 0)
		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
		    "but not used", name, unused_ops);
#endif

	return (error);
}

/*
 * Free the vnodeops created as a result of vn_make_ops()
 */
void
vn_freevnodeops(vnodeops_t *vnops)
{
	kmem_free(vnops, sizeof (vnodeops_t));
}

/*
 * Vnode cache.
 */

/*
 * kmem cache constructor: initialize a vnode's embedded synchronization
 * objects and NULL out the pointer fields that vn_reinit()/vn_recycle()
 * inspect.
 */
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct vnode *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = NULL;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	return (0);
}

/*
 * kmem cache destructor: tear down what vn_cache_constructor() set up.
 */
/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	struct vnode *vp;

	vp = buf;

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}

/*
 * Create the global vnode kmem cache (vn_cache).
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}

/*
 * Destroy the global vnode kmem cache.
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}

/*
 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
 * cached by the file system and vnodes remain associated.
 */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(vp->v_pages == NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	if (vp->v_path) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}

/*
 * Used to reset the vnode fields including those that are directly accessible
 * as well as those which require an accessor function.
 *
 * Does not initialize:
 *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
 *	v_data (since FS-nodes and vnodes point to each other and should
 *		be updated simultaneously)
 *	v_op (in case someone needs to make a VOP call on this object)
 */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	vp->v_pages = NULL;

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}

/*
 * Allocate a fresh vnode from the vnode cache and initialize it via
 * vn_reinit().  kmflag is the kmem allocation flag (e.g. KM_SLEEP), so
 * the result may be NULL for non-sleeping allocations.
 */
vnode_t *
vn_alloc(int kmflag)
{
	vnode_t *vp;

	vp = kmem_cache_alloc(vn_cache, kmflag);

	if (vp != NULL) {
		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
		vp->v_fopdata = NULL;
		vn_reinit(vp);
	}

	return (vp);
}

/*
 * Release a vnode back to the vnode cache, freeing the cached path,
 * FEM head, and per-vnode data first.  The vnode must have no locks
 * or share reservations outstanding (asserted).
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	if (vp->v_path != NULL) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = NULL;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}

/*
 * vnode status changes, should define better states than 1, 0.
 * Each routine forwards a vnode state transition to the vfs via
 * VFS_VNSTATE(), and is a no-op when the vfs has no femhead/implp.
 */
void
vn_reclaim(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
}

void
vn_idle(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
}

void
vn_exists(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
}

void
vn_invalid(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
}

/*
 * Vnode event notification.  The vnevent_* routines below fire a
 * VOP_VNEVENT on the vnode; except for vnevent_support() they are
 * no-ops when the vnode has no FEM head attached (v_femhead == NULL).
 */

int
vnevent_support(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL)
		return (EINVAL);

	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
}

void
vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
}

void
vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
}

void
vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
}

void
vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
}

void
vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
}

void
vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
}

void
vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
}

void
vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
}

void
vnevent_create(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
}

void
vnevent_link(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
}

void
vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
}

void
vnevent_truncate(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
}

/*
 * Vnode accessors.
 */

/* Return nonzero if the vnode's filesystem is mounted read-only. */
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}

/* Return nonzero if the vnode has active file locks. */
int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}

/*
 * Return nonzero if the vnode has file locks and its mode makes them
 * mandatory (per the MANDLOCK() macro).
 */
int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}

/* Return nonzero if the vnode has pages cached in memory. */
int
vn_has_cached_data(vnode_t *vp)
{
	return (vp->v_pages != NULL);
}

/*
 * Return 0 if the vnode in question shouldn't be permitted into a zone via
 * zone_enter(2).
 */
int
vn_can_change_zones(vnode_t *vp)
{
	struct vfssw *vswp;
	int allow = 1;
	vnode_t *rvp;

	if (nfs_global_client_only != 0)
		return (1);

	/*
	 * We always want to look at the underlying vnode if there is one.
	 */
	if (VOP_REALVP(vp, &rvp, NULL) != 0)
		rvp = vp;
	/*
	 * Some pseudo filesystems (including doorfs) don't actually register
	 * their vfsops_t, so the following may return NULL; we happily let
	 * such vnodes switch zones.
	 */
	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
	if (vswp != NULL) {
		if (vswp->vsw_flag & VSW_NOTZONESAFE)
			allow = 0;
		vfs_unrefvfssw(vswp);
	}
	return (allow);
}

/*
 * Return nonzero if the vnode is a mount point, zero if not.
 */
int
vn_ismntpt(vnode_t *vp)
{
	return (vp->v_vfsmountedhere != NULL);
}

/* Retrieve the vfs (if any) mounted on this vnode */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}

/*
 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
 */
int
vn_in_dnlc(vnode_t *vp)
{
	return (vp->v_count_dnlc > 0);
}

/*
 * vn_has_other_opens() checks whether a particular file is opened by more than
 * just the caller and whether the open is for read and/or write.
 * This routine is for calling after the caller has already called VOP_OPEN()
 * and the caller wishes to know if they are the only one with it open for
 * the mode(s) specified.
 *
 * Vnode counts are only kept on regular files (v_type=VREG).
 */
int
vn_has_other_opens(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

	switch (mode) {
	case V_WRITE:
		if (vp->v_wrcnt > 1)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_rdcnt > 1)
			return (V_TRUE);
		break;
	}

	return (V_FALSE);
}

/*
 * vn_is_opened() checks whether a particular file is opened and
 * whether the open is for read and/or write.
 *
 * Vnode counts are only kept on regular files (v_type=VREG).
 */
int
vn_is_opened(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

	switch (mode) {
	case V_WRITE:
		if (vp->v_wrcnt)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_rdcnt && vp->v_wrcnt)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_rdcnt || vp->v_wrcnt)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_rdcnt)
			return (V_TRUE);
		break;
	}

	return (V_FALSE);
}

/*
 * vn_is_mapped() checks whether a particular file is mapped and whether
 * the file is mapped read and/or write.
 */
int
vn_is_mapped(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

#if !defined(_LP64)
	switch (mode) {
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second
	 */
	case V_WRITE:
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_READ:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (V_TRUE);
		break;
	}
#else
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_mmap_read)
			return (V_TRUE);
		break;
	}
#endif

	return (V_FALSE);
}

/*
 * Set the operations vector for a vnode.
 *
 * FEM ensures that the v_femhead pointer is filled in before the
 * v_op pointer is changed.  This means that if the v_femhead pointer
 * is NULL, and the v_op field hasn't changed since before which checked
 * the v_femhead pointer; then our update is ok - we are not racing with
 * FEM.
 */
void
vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
{
	vnodeops_t *op;

	ASSERT(vp != NULL);
	ASSERT(vnodeops != NULL);

	op = vp->v_op;
	membar_consumer();
	/*
	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
	 * in effect on the vnode and we need to have FEM deal with it.
	 */
	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
	    op) {
		fem_setvnops(vp, vnodeops);
	}
}

/*
 * Retrieve the operations vector for a vnode
 * As with vn_setops(above); make sure we aren't racing with FEM.
 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
 * make sense to the callers of this routine.
 */
vnodeops_t *
vn_getops(vnode_t *vp)
{
	vnodeops_t *op;

	ASSERT(vp != NULL);

	op = vp->v_op;
	membar_consumer();
	if (vp->v_femhead == NULL && op == vp->v_op) {
		/* No FEM and v_op was stable across the re-read: use it. */
		return (op);
	} else {
		return (fem_getvnops(vp));
	}
}

/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}

/*
 * Returns non-zero (1) if the specified operation matches the
 * corresponding operation for that the vnode.
 * Returns zero (0) if not.
 */

/* Cheap first-character check before the full strcmp. */
#define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))

int
vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
{
	const fs_operation_trans_def_t *otdp;
	fs_generic_func_p *loc = NULL;
	vnodeops_t *vop = vn_getops(vp);

	ASSERT(vopname != NULL);

	/* Find the named operation's offset within the vnodeops vector. */
	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
		if (MATCHNAME(otdp->name, vopname)) {
			loc = (fs_generic_func_p *)
			    ((char *)(vop) + otdp->offset);
			break;
		}
	}

	return ((loc != NULL) && (*loc == funcp));
}

/*
 * fs_new_caller_id() needs to return a unique ID on a given local system.
 * The IDs do not need to survive across reboots.  These are primarily
 * used so that (FEM) monitors can detect particular callers (such as
 * the NFS server) to a given vnode/vfs operation.
 */
u_longlong_t
fs_new_caller_id()
{
	static uint64_t next_caller_id = 0LL; /* First call returns 1 */

	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
}

/*
 * Given a starting vnode and a path, updates the path in the target vnode in
 * a safe manner.  If the vnode already has path information embedded, then the
 * cached path is left untouched.
 */
size_t max_vnode_path = 4 * MAXPATHLEN;

/*
 * Build vp's cached v_path by appending 'path' (length 'plen', not
 * necessarily NUL-terminated) to the cached path of 'startvp' (or
 * 'rootvp' when 'path' is absolute).  Best-effort: gives up silently
 * if the base path is missing, changes underfoot, or the result would
 * exceed max_vnode_path.
 */
void
vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
    const char *path, size_t plen)
{
	char	*rpath;
	vnode_t	*base;
	size_t	rpathlen, rpathalloc;
	int	doslash = 1;

	if (*path == '/') {
		base = rootvp;
		path++;
		plen--;
	} else {
		base = startvp;
	}

	/*
	 * We cannot grab base->v_lock while we hold vp->v_lock because of
	 * the potential for deadlock.
	 */
	mutex_enter(&base->v_lock);
	if (base->v_path == NULL) {
		mutex_exit(&base->v_lock);
		return;
	}

	rpathlen = strlen(base->v_path);
	rpathalloc = rpathlen + plen + 1;
	/* Avoid adding a slash if there's already one there */
	if (base->v_path[rpathlen-1] == '/')
		doslash = 0;
	else
		rpathalloc++;

	/*
	 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
	 * so we must do this dance.  If, by chance, something changes the
	 * path, just give up since there is no real harm.
	 */
	mutex_exit(&base->v_lock);

	/* Paths should stay within reason */
	if (rpathalloc > max_vnode_path)
		return;

	rpath = kmem_alloc(rpathalloc, KM_SLEEP);

	/* Re-validate: the base path may have changed while unlocked. */
	mutex_enter(&base->v_lock);
	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
		mutex_exit(&base->v_lock);
		kmem_free(rpath, rpathalloc);
		return;
	}
	bcopy(base->v_path, rpath, rpathlen);
	mutex_exit(&base->v_lock);

	if (doslash)
		rpath[rpathlen++] = '/';
	bcopy(path, rpath + rpathlen, plen);
	rpath[rpathlen + plen] = '\0';

	/* Only install if no one else set v_path first. */
	mutex_enter(&vp->v_lock);
	if (vp->v_path != NULL) {
		mutex_exit(&vp->v_lock);
		kmem_free(rpath, rpathalloc);
	} else {
		vp->v_path = rpath;
		mutex_exit(&vp->v_lock);
	}
}

/*
 * Sets the path to the vnode to be the given string, regardless of current
 * context.  The string must be a complete path from rootdir.  This is only
 * used by fsop_root() for setting the path based on the mountpoint.
 */
void
vn_setpath_str(struct vnode *vp, const char *str, size_t len)
{
	char *buf = kmem_alloc(len + 1, KM_SLEEP);

	mutex_enter(&vp->v_lock);
	if (vp->v_path != NULL) {
		/* Someone beat us to it; keep the existing path. */
		mutex_exit(&vp->v_lock);
		kmem_free(buf, len + 1);
		return;
	}

	vp->v_path = buf;
	bcopy(str, vp->v_path, len);
	vp->v_path[len] = '\0';

	mutex_exit(&vp->v_lock);
}

/*
 * Called from within filesystem's vop_rename() to handle renames once the
 * target vnode is available.  Drops the old cached path and rebuilds it
 * from the new parent directory and name.
 */
void
vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
{
	char *tmp;

	mutex_enter(&vp->v_lock);
	tmp = vp->v_path;
	vp->v_path = NULL;
	mutex_exit(&vp->v_lock);

	vn_setpath(rootdir, dvp, vp, nm, len);
	if (tmp != NULL)
		kmem_free(tmp, strlen(tmp) + 1);
}

/*
 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to the be the same as the source vnode.  Best-effort; bails out if
 * either side's path changes or is already set.
 */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	int alloc;

	mutex_enter(&src->v_lock);
	if (src->v_path == NULL) {
		mutex_exit(&src->v_lock);
		return;
	}
	alloc = strlen(src->v_path) + 1;

	/* avoid kmem_alloc() with lock held */
	mutex_exit(&src->v_lock);
	buf = kmem_alloc(alloc, KM_SLEEP);
	mutex_enter(&src->v_lock);
	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
		/* Source path changed while unlocked; give up. */
		mutex_exit(&src->v_lock);
		kmem_free(buf, alloc);
		return;
	}
	bcopy(src->v_path, buf, alloc);
	mutex_exit(&src->v_lock);

	mutex_enter(&dst->v_lock);
	if (dst->v_path != NULL) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, alloc);
		return;
	}
	dst->v_path = buf;
	mutex_exit(&dst->v_lock);
}

/*
 * XXX Private interface for segvn routines that handle vnode
 * large page segments.
 *
 * return 1 if vp's file system VOP_PAGEIO() implementation
 * can be safely used instead of VOP_GETPAGE() for handling
 * pagefaults against regular non swap files. VOP_PAGEIO()
 * interface is considered safe here if its implementation
 * is very close to VOP_GETPAGE() implementation.
 * e.g. It zero's out the part of the page beyond EOF. Doesn't
 * panic if there're file holes but instead returns an error.
 * Doesn't assume file won't be changed by user writes, etc.
 *
 * return 0 otherwise.
 *
 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
 */
int
vn_vmpss_usepageio(vnode_t *vp)
{
	vfs_t	*vfsp = vp->v_vfsp;
	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
	char **fsok = pageio_ok_fss;

	if (fsname == NULL) {
		return (0);
	}

	for (; *fsok; fsok++) {
		if (strcmp(*fsok, fsname) == 0) {
			return (1);
		}
	}
	return (0);
}

/* VOP_XXX() macros call the corresponding fop_xxx() function */

int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp && *vpp != NULL) {
			vn_copypath(vp, *vpp);
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}

int
fop_close(
	vnode_t *vp,
	int flag,
	int count,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
	VOPSTATS_UPDATE(vp, close);
	/*
	 * Check passed in count to handle possible dups. Vnode counts are only
	 * kept on regular files
	 */
	if ((vp->v_type == VREG) && (count == 1)) {
		if (flag & FREAD) {
			ASSERT(vp->v_rdcnt > 0);
			atomic_dec_32(&vp->v_rdcnt);
		}
		if (flag & FWRITE) {
			ASSERT(vp->v_wrcnt > 0);
			atomic_dec_32(&vp->v_wrcnt);
		}
	}
	return (err);
}

int
fop_read(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;
	/* Snapshot resid so the byte count transferred can be recorded. */
	ssize_t	resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	VOPSTATS_UPDATE_IO(vp, read,
	    read_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_write(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;
	/* Snapshot resid so the byte count transferred can be recorded. */
	ssize_t	resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
	VOPSTATS_UPDATE_IO(vp, write,
	    write_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_ioctl(
	vnode_t *vp,
	int cmd,
	intptr_t arg,
	int flag,
	cred_t *cr,
	int *rvalp,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
	VOPSTATS_UPDATE(vp, ioctl);
	return (err);
}

int
fop_setfl(
	vnode_t *vp,
	int oflags,
	int nflags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_setfl)(vp,
	    oflags, nflags, cr, ct);
	VOPSTATS_UPDATE(vp, setfl);
	return (err);
}

int
fop_getattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, getattr);
	return (err);
}

int
fop_setattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, setattr);
	return (err);
}

int
fop_access(
	vnode_t *vp,
	int mode,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/* ACE-mask access checks require file system support. */
	if ((flags & V_ACE_MASK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
	VOPSTATS_UPDATE(vp, access);
	return (err);
}

int
fop_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cr,
	caller_context_t *ct,
	int *deflags,		/* Returned per-dirent flags */
	pathname_t *ppnp)	/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.  It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = (*(dvp)->v_op->vop_lookup)
		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
	}
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		/* Populate the path cache for the freshly looked-up vnode. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
		}
	}

	return (ret);
}

int
fop_create(
	vnode_t *dvp,
	char *name,
	vattr_t *vap,
	vcexcl_t excl,
	int mode,
	vnode_t **vpp,
	cred_t *cr,
	int flags,
	caller_context_t *ct,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_create)
	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, create);
		/* Populate the path cache for the newly created vnode. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
		}
	}

	return (ret);
}

int
fop_remove(
	vnode_t *dvp,
	char *nm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, remove);
	return (err);
}

int
fop_link(
	vnode_t *tdvp,
	vnode_t *svp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the target file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(tdvp, link);
	return (err);
}

int
fop_rename(
	vnode_t *sdvp,
	char *snm,
	vnode_t *tdvp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the file system involved does not support
	 * case-insensitive access and said access is requested, fail
	 * quickly.
	 */
	if (flags & FIGNORECASE &&
	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(sdvp, rename);
	return (err);
}

int
fop_mkdir(
	vnode_t *dvp,
	char *dirname,
	vattr_t *vap,
	vnode_t **vpp,
	cred_t *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_mkdir)
	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		/* Populate the path cache for the new directory. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, dirname,
			    strlen(dirname));
		}
	}

	return (ret);
}

int
fop_rmdir(
	vnode_t *dvp,
	char *nm,
	vnode_t *cdir,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, rmdir);
	return (err);
}

int
fop_readdir(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	int *eofp,
	caller_context_t *ct,
	int flags)
{
	int	err;
	ssize_t	resid_start = uiop->uio_resid;

	/*
	 * If this file system doesn't support retrieving directory
	 * entry flags and said access is requested, fail quickly.
	 */
	if (flags & V_RDDIR_ENTFLAGS &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
		return (EINVAL);

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
	VOPSTATS_UPDATE_IO(vp, readdir,
	    readdir_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;
	xvattr_t xvattr;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = (*(dvp)->v_op->vop_symlink)
	    (dvp, linkname, vap, target, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}

int
fop_readlink(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, readlink);
	return (err);
}

int
fop_fsync(
	vnode_t *vp,
	int syncflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
	VOPSTATS_UPDATE(vp, fsync);
	return (err);
}

void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
}

int
fop_fid(
	vnode_t *vp,
	fid_t *fidp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
	VOPSTATS_UPDATE(vp, fid);
	return (err);
}

int
fop_rwlock(
	vnode_t *vp,
	int write_lock,
	caller_context_t *ct)
{
	int	ret;

	ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
	VOPSTATS_UPDATE(vp, rwlock);
	return (ret);
}

void
fop_rwunlock(
	vnode_t *vp,
	int write_lock,
	caller_context_t *ct)
{
	(*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
	VOPSTATS_UPDATE(vp, rwunlock);
}

int
fop_seek(
	vnode_t *vp,
	offset_t ooff,
	offset_t *noffp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
	VOPSTATS_UPDATE(vp, seek);
	return (err);
}

int
fop_cmp(
	vnode_t *vp1,
	vnode_t *vp2,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
	VOPSTATS_UPDATE(vp1, cmp);
	return (err);
}

int
fop_frlock(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	struct flk_callback *flk_cbp,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_frlock)
	    (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
	VOPSTATS_UPDATE(vp, frlock);
	return (err);
}

int
fop_space(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
	VOPSTATS_UPDATE(vp, space);
	return (err);
}

int
fop_realvp(
	vnode_t *vp,
	vnode_t **vpp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
	VOPSTATS_UPDATE(vp, realvp);
	return (err);
}

int
fop_getpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getpage)
	    (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
	VOPSTATS_UPDATE(vp, getpage);
	return (err);
}

int
fop_putpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
	VOPSTATS_UPDATE(vp, putpage);
	return (err);
}

int
fop_map(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_map)
	    (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
	VOPSTATS_UPDATE(vp, map);
	return (err);
}

int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/* Track mmap read/write references, regular files only. */
	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}

int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}

int
fop_poll(
	vnode_t *vp,
	short events,
	int anyyet,
	short *reventsp,
	struct pollhead **phpp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
	VOPSTATS_UPDATE(vp, poll);
	return (err);
}

int
fop_dump(
	vnode_t *vp,
	caddr_t addr,
	offset_t lbdn,
	offset_t dblks,
	caller_context_t *ct)
{
	int	err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EIO);

	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
	VOPSTATS_UPDATE(vp, dump);
	return (err);
}

int
fop_pathconf(
	vnode_t *vp,
	int cmd,
	ulong_t *valp,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
	VOPSTATS_UPDATE(vp, pathconf);
	return (err);
}

int
fop_pageio(
	vnode_t *vp,
	struct page *pp,
	u_offset_t io_off,
	size_t io_len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
	VOPSTATS_UPDATE(vp, pageio);
	return (err);
}

int
fop_dumpctl(
	vnode_t *vp,
	int action,
	offset_t *blkp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
	VOPSTATS_UPDATE(vp, dumpctl);
	return (err);
}

void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
}

int
fop_setsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}

int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}

int
fop_shrlock(
	vnode_t *vp,
	int cmd,
	struct shrlock *shr,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
	VOPSTATS_UPDATE(vp, shrlock);
	return (err);
}

int
fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
	VOPSTATS_UPDATE(vp, vnevent);
	return (err);
}

int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	/* Zero-copy requires explicit file system support. */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}

int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp,
	    VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}

/*
 * Default destructor
 *	Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{}

/*
 * Create a key (index into per vnode array)
 *	Locks out vsd_create, vsd_destroy, and vsd_free
 *	May allocate memory with lock held
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int	i;
	uint_t	nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}

/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}

/*
 * Quickly return the per vnode value that was stored with the specified key
 * Assumes the caller is protecting key from vsd_create and vsd_destroy
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
void *
vsd_get(vnode_t *vp, uint_t key)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	vsd = vp->v_vsd;

	if (key && vsd != NULL && key <= vsd->vs_nkeys)
		return (vsd->vs_value[key - 1]);
	return (NULL);
}

/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}

/*
 * Called from vn_free() to run the destructor function for each vsd
 *	Locks out vsd_create and vsd_destroy
 *	Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}

/*
 * realloc
 */
static void *
vsd_realloc(void *old, size_t osize, size_t nsize)
{
	void *new;

	new = kmem_zalloc(nsize, KM_SLEEP);
	if (old) {
		bcopy(old, new, osize);
		kmem_free(old, osize);
	}
	return (new);
}

/*
 * Setup the extensible system attribute for creating a reparse point.
 * The symlink data 'target' is validated for proper format of a reparse
 * string and a check also made to make sure the symlink data does not
 * point to an existing file.
 *
 * return 0 if ok else -1.
 */
static int
fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
{
	xoptattr_t *xoap;

	if ((!target) || (!vap) || (!xvattr))
		return (-1);

	/* validate reparse string */
	if (reparse_validate((const char *)target))
		return (-1);

	/* Request the XAT_REPARSE optional attribute on top of 'vap'. */
	xva_init(xvattr);
	xvattr->xva_vattr = *vap;
	xvattr->xva_vattr.va_mask |= AT_XVATTR;
	xoap = xva_getxoptattr(xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(xvattr, XAT_REPARSE);
	xoap->xoa_reparse = 1;

	return (0);
}

/*
 * Function to check whether a symlink is a reparse point.
 * Return B_TRUE if it is a reparse point, else return B_FALSE
 */
boolean_t
vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	xvattr_t xvattr;
	xoptattr_t *xoap;

	/* Only symlinks on xvattr-capable file systems can be reparse. */
	if ((vp->v_type != VLNK) ||
	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
		return (B_FALSE);

	xva_init(&xvattr);
	xoap = xva_getxoptattr(&xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(&xvattr, XAT_REPARSE);

	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
		return (B_FALSE);

	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
		return (B_FALSE);

	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ctldir.c	(revision 318932)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ctldir.c	(revision 318933)
@@ -1,1361 +1,1361 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. */ /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' directory, but this may expand in the * future. The elements are built using the GFS primitives, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding vnode. * * All mounts are handled automatically by the kernel, but unmounts are * (currently) handled from user land. The main reason is that there is no * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. * * The '.zfs', '.zfs/snapshot', and all directories created under * '.zfs/snapshot' (ie: '.zfs/snapshot/') are all GFS nodes and * share the same vfs_t as the head filesystem (what '.zfs' lives under). * * File systems mounted ontop of the GFS nodes '.zfs/snapshot/' * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. 
* However, vnodes within these mounted on file systems have their v_vfsp * fields set to the head filesystem to make NFS happy (see * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t * so that it cannot be freed until all snapshots have been unmounted. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" typedef struct zfsctl_node { gfs_dir_t zc_gfs_private; uint64_t zc_id; timestruc_t zc_cmtime; /* ctime and mtime, always the same */ } zfsctl_node_t; typedef struct zfsctl_snapdir { zfsctl_node_t sd_node; kmutex_t sd_lock; avl_tree_t sd_snaps; } zfsctl_snapdir_t; typedef struct { char *se_name; vnode_t *se_root; avl_node_t se_node; } zfs_snapentry_t; static int snapentry_compare(const void *a, const void *b) { const zfs_snapentry_t *sa = a; const zfs_snapentry_t *sb = b; int ret = strcmp(sa->se_name, sb->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } vnodeops_t *zfsctl_ops_root; vnodeops_t *zfsctl_ops_snapdir; vnodeops_t *zfsctl_ops_snapshot; vnodeops_t *zfsctl_ops_shares; static const fs_operation_def_t zfsctl_tops_root[]; static const fs_operation_def_t zfsctl_tops_snapdir[]; static const fs_operation_def_t zfsctl_tops_snapshot[]; static const fs_operation_def_t zfsctl_tops_shares[]; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); static gfs_opsvec_t zfsctl_opsvec[] = { { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares }, { NULL } }; /* * Root directory elements. We only have two entries * snapshot and shares. 
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
	{ NULL }
};

/* include . and .. in the calculation */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)

/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories. This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 */
void
zfsctl_init(void)
{
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}

void
zfsctl_fini(void)
{
	/*
	 * Remove vfsctl vnode ops
	 */
	if (zfsctl_ops_root)
		vn_freevnodeops(zfsctl_ops_root);
	if (zfsctl_ops_snapdir)
		vn_freevnodeops(zfsctl_ops_snapdir);
	if (zfsctl_ops_snapshot)
		vn_freevnodeops(zfsctl_ops_snapshot);
	if (zfsctl_ops_shares)
		vn_freevnodeops(zfsctl_ops_shares);

	zfsctl_ops_root = NULL;
	zfsctl_ops_snapdir = NULL;
	zfsctl_ops_snapshot = NULL;
	zfsctl_ops_shares = NULL;
}

/*
 * Return B_TRUE iff vp's vnodeops match one of the four .zfs control
 * directory ops vectors (root, snapdir, snapshot, or shares).
 */
boolean_t
zfsctl_is_node(vnode_t *vp)
{
	return (vn_matchops(vp, zfsctl_ops_root) ||
	    vn_matchops(vp, zfsctl_ops_snapdir) ||
	    vn_matchops(vp, zfsctl_ops_snapshot) ||
	    vn_matchops(vp, zfsctl_ops_shares));
}

/*
 * Return the inode number associated with the 'snapshot' or
 * 'shares' directory.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	/* index maps to zfsctl_root_entries: 0 = snapshot, 1 = shares */
	ASSERT(index < 2);

	if (index == 0)
		return (ZFSCTL_INO_SNAPDIR);

	return (zfsvfs->z_shares_dir);
}

/*
 * Create the '.zfs' directory. This directory is cached as part of the VFS
 * structure. This results in a hold on the vfs_t. The code in zfs_umount()
 * therefore checks against a vfs_count of 2 instead of 1. This reference
 * is removed when the ctldir is destroyed in the unmount.
 */
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
	vnode_t *vp, *rvp;
	zfsctl_node_t *zcp;
	uint64_t crtime[2];

	ASSERT(zfsvfs->z_ctldir == NULL);

	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = ZFSCTL_INO_ROOT;
	/*
	 * Stamp zc_cmtime with the head filesystem root's creation time,
	 * looked up via the root znode's SA handle.
	 */
	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
	    &crtime, sizeof (crtime)));
	ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
	VN_RELE(rvp);

	/*
	 * We're only faking the fact that we have a root of a filesystem for
	 * the sake of the GFS interfaces. Undo the flag manipulation it did
	 * for us.
	 */
	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);

	zfsvfs->z_ctldir = vp;
}

/*
 * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
 * There might still be more references if we were force unmounted, but only
 * new zfs_inactive() calls can occur and they don't reference .zfs
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	VN_RELE(zfsvfs->z_ctldir);
	zfsvfs->z_ctldir = NULL;
}

/*
 * Given a root znode, retrieve the associated .zfs directory.
 * Add a hold to the vnode and return it.
 */
vnode_t *
zfsctl_root(znode_t *zp)
{
	ASSERT(zfs_has_ctldir(zp));
	VN_HOLD(zp->z_zfsvfs->z_ctldir);
	return (zp->z_zfsvfs->z_ctldir);
}

/*
 * Common open routine.  Disallow any write access.
 */
/* ARGSUSED */
static int
zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
{
	if (flags & FWRITE)
		return (SET_ERROR(EACCES));

	return (0);
}

/*
 * Common close routine.  Nothing to do here.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr, caller_context_t *ct)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 */
/* ARGSUSED */
static int
zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	/* Handle both ACE-style and legacy mode-bit access requests. */
	if (flags & V_ACE_MASK) {
		if (mode & ACE_ALL_WRITE_PERMS)
			return (SET_ERROR(EACCES));
	} else {
		if (mode & VWRITE)
			return (SET_ERROR(EACCES));
	}

	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	timestruc_t	now;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	/* Read + search for everyone; writes are disallowed above. */
	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
	    S_IROTH | S_IXOTH;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
}

/*ARGSUSED*/
static int
zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t	*zcp = vp->v_data;
	uint64_t	object = zcp->zc_id;
	zfid_short_t	*zfid;
	int		i;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

	/* Encode the object number little-endian, a byte at a time. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*ARGSUSED*/
static int
zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	znode_t		*dzp;
	int		error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}

	/* Delegate FID generation to the real shares directory znode. */
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_FID(ZTOV(dzp), fidp, ct);
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all
files and directories * within the .zfs pseudo-filesystem. We use the following scheme: * * ENTRY ZFSCTL_INODE * .zfs 1 * .zfs/snapshot 2 * .zfs/snapshot/ objectid(snap) */ #define ZFSCTL_INO_SNAP(id) (id) /* * Get root directory attributes. */ /* ARGSUSED */ static int zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_node_t *zcp = vp->v_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); return (0); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; /* * No extended attributes allowed under .zfs */ if (flags & LOOKUP_XATTR) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { err = VFS_ROOT(dvp->v_vfsp, vpp); } else { err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, cr, ct, direntflags, realpnp); } ZFS_EXIT(zfsvfs); return (err); } static int zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { /* * We only care about ACL_ENABLED so that libsec can * display ACL correctly and not default to POSIX draft. 
*/ if (cmd == _PC_ACL_ENABLED) { *valp = _ACL_ACE_ENABLED; return (0); } return (fs_pathconf(vp, cmd, valp, cr, ct)); } static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; /* * Gets the full dataset name that corresponds to the given snapshot name * Example: * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" */ static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; if (zfs_component_namecheck(name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(zname, "@"); (void) strcat(zname, name); return (0); } static int zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) { vnode_t *svp = sep->se_root; int error; ASSERT(vn_ismntpt(svp)); /* this will be dropped by dounmount() */ if ((error = vn_vfswlock(svp)) != 0) return (error); VN_HOLD(svp); error = dounmount(vn_mountedvfs(svp), fflags, cr); if (error) { VN_RELE(svp); return (error); } /* * We can't use VN_RELE(), as that will try to invoke * zfsctl_snapdir_inactive(), which would cause us to destroy * the sd_lock mutex held by our caller. 
*/ ASSERT(svp->v_count == 1); gfs_vop_inactive(svp, cr, NULL); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); return (0); } static void zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) { avl_index_t where; vfs_t *vfsp; refstr_t *pathref; char newpath[MAXNAMELEN]; char *tail; ASSERT(MUTEX_HELD(&sdp->sd_lock)); ASSERT(sep != NULL); vfsp = vn_mountedvfs(sep->se_root); ASSERT(vfsp != NULL); vfs_lock_wait(vfsp); /* * Change the name in the AVL tree. */ avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); avl_insert(&sdp->sd_snaps, sep, where); /* * Change the current mountpoint info: * - update the tail of the mntpoint path * - update the tail of the resource path */ pathref = vfs_getmntpoint(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '/')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setmntpoint(vfsp, newpath, 0); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '@')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setresource(vfsp, newpath, 0); vfs_unlock(vfsp); } /*ARGSUSED*/ static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; zfsvfs_t *zfsvfs; avl_index_t where; char from[ZFS_MAX_DATASET_NAME_LEN], to[ZFS_MAX_DATASET_NAME_LEN]; char real[ZFS_MAX_DATASET_NAME_LEN], fsname[ZFS_MAX_DATASET_NAME_LEN]; int err; zfsvfs = sdvp->v_vfsp->vfs_data; 
ZFS_ENTER(zfsvfs); if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, sizeof (real), NULL); if (err == 0) { snm = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } } ZFS_EXIT(zfsvfs); dmu_objset_name(zfsvfs->z_os, fsname); err = zfsctl_snapshot_zname(sdvp, snm, sizeof (from), from); if (err == 0) err = zfsctl_snapshot_zname(tdvp, tnm, sizeof (to), to); if (err == 0) err = zfs_secpolicy_rename_perms(from, to, cr); if (err != 0) return (err); /* * Cannot move snapshots out of the snapdir. */ if (sdvp != tdvp) return (SET_ERROR(EINVAL)); if (strcmp(snm, tnm) == 0) return (0); mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { mutex_exit(&sdp->sd_lock); return (SET_ERROR(ENOENT)); } err = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (err == 0) zfsctl_rename_snap(sdp, sep, tnm); mutex_exit(&sdp->sd_lock); return (err); } /* ARGSUSED */ static int zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = dvp->v_data; zfs_snapentry_t *sep; zfs_snapentry_t search; zfsvfs_t *zfsvfs; char snapname[ZFS_MAX_DATASET_NAME_LEN]; char real[ZFS_MAX_DATASET_NAME_LEN]; int err; zfsvfs = dvp->v_vfsp->vfs_data; ZFS_ENTER(zfsvfs); if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { err = dmu_snapshot_realname(zfsvfs->z_os, name, real, sizeof (real), NULL); if (err == 0) { name = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } } ZFS_EXIT(zfsvfs); err = zfsctl_snapshot_zname(dvp, name, sizeof (snapname), snapname); if (err == 0) err = zfs_secpolicy_destroy_perms(snapname, cr); if (err != 0) return (err); mutex_enter(&sdp->sd_lock); search.se_name = name; sep = avl_find(&sdp->sd_snaps, &search, NULL); if (sep) { avl_remove(&sdp->sd_snaps, sep); err = zfsctl_unmount_snap(sep, MS_FORCE, cr); if (err != 
0) avl_add(&sdp->sd_snaps, sep); else err = dsl_destroy_snapshot(snapname, B_FALSE); } else { err = SET_ERROR(ENOENT); } mutex_exit(&sdp->sd_lock); return (err); } /* * This creates a snapshot under '.zfs/snapshot'. */ /* ARGSUSED */ static int zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; char name[ZFS_MAX_DATASET_NAME_LEN]; int err; static enum symfollow follow = NO_FOLLOW; static enum uio_seg seg = UIO_SYSSPACE; if (zfs_component_namecheck(dirname, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(zfsvfs->z_os, name); *vpp = NULL; err = zfs_secpolicy_snapshot_perms(name, cr); if (err != 0) return (err); if (err == 0) { err = dmu_objset_snapshot_one(name, dirname); if (err != 0) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); } return (err); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. */ /* ARGSUSED */ static int zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[ZFS_MAX_DATASET_NAME_LEN]; char real[ZFS_MAX_DATASET_NAME_LEN]; char *mountpoint; zfs_snapentry_t *sep, search; struct mounta margs; vfs_t *vfsp; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; /* * No extended attributes allowed under .zfs */ if (flags & LOOKUP_XATTR) return (SET_ERROR(EINVAL)); ASSERT(dvp->v_type == VDIR); /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the * spec (which looks like a local path for zfs). 
We need to * add some flag to domount() to tell it not to do this lookup. */ if (MUTEX_HELD(&sdp->sd_lock)) return (SET_ERROR(ENOENT)); ZFS_ENTER(zfsvfs); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { ZFS_EXIT(zfsvfs); return (0); } if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, sizeof (real), &conflict); if (err == 0) { nm = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } if (realpnp) (void) strlcpy(realpnp->pn_buf, nm, realpnp->pn_bufsize); if (conflict && direntflags) *direntflags = ED_CASE_CONFLICT; } mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { *vpp = sep->se_root; VN_HOLD(*vpp); err = traverse(vpp); if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } else if (*vpp == sep->se_root) { /* * The snapshot was unmounted behind our backs, * try to remount it. */ goto domount; } else { /* * VROOT was set during the traverse call. We need * to clear it since we're pretending to be part * of our parent's vfs. */ (*vpp)->v_flag &= ~VROOT; } mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (err); } /* * The requested snapshot is not currently mounted, look it up. */ err = zfsctl_snapshot_zname(dvp, nm, sizeof (snapname), snapname); if (err != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); /* * handle "ls *" or "?" in a graceful manner, * forcing EILSEQ to ENOENT. * Since shell ultimately passes "*" or "?" as name to lookup */ return (err == EILSEQ ? 
ENOENT : err); } if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); avl_insert(&sdp->sd_snaps, sep, where); dmu_objset_rele(snap, FTAG); domount: mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + strlen("/.zfs/snapshot/") + strlen(nm) + 1; mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", refstr_value(dvp->v_vfsp->vfs_mntpt), nm); margs.spec = snapname; margs.dir = mountpoint; margs.flags = MS_SYSSPACE | MS_NOMNTTAB; margs.fstype = "zfs"; margs.dataptr = NULL; margs.datalen = 0; margs.optptr = NULL; margs.optlen = 0; err = domount("zfs", &margs, *vpp, kcred, &vfsp); kmem_free(mountpoint, mountpoint_len); if (err == 0) { /* * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ and returns * the ZFS vnode mounted on top of the GFS node. This ZFS * vnode is the root of the newly created vfsp. */ VFS_RELE(vfsp); err = traverse(vpp); } if (err == 0) { /* * Fix up the root vnode mounted on .zfs/snapshot/. * * This is where we lie about our v_vfsp in order to * make .zfs/snapshot/ accessible over NFS * without requiring manual mounts of . */ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; (*vpp)->v_vfsp = zfsvfs->z_vfs; (*vpp)->v_flag &= ~VROOT; } mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. 
*/ if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } return (err); } /* ARGSUSED */ static int zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp, flags, rdir, cr, ct, direntflags, realpnp); VN_RELE(ZTOV(dzp)); } ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[ZFS_MAX_DATASET_NAME_LEN]; uint64_t id, cookie; boolean_t case_conflict; int error; ZFS_ENTER(zfsvfs); cookie = *offp; dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), snapname, &id, &cookie, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) { ZFS_EXIT(zfsvfs); if (error == ENOENT) { *eofp = 1; return (0); } return (error); } if (flags & V_RDDIR_ENTFLAGS) { edirent_t *eodp = dp; (void) strcpy(eodp->ed_name, snapname); eodp->ed_ino = ZFSCTL_INO_SNAP(id); eodp->ed_eflags = case_conflict ? 
ED_CASE_CONFLICT : 0; } else { struct dirent64 *odp = dp; (void) strcpy(odp->d_name, snapname); odp->d_ino = ZFSCTL_INO_SNAP(id); } *nextp = cookie; ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags); VN_RELE(ZTOV(dzp)); } else { *eofp = 1; error = SET_ERROR(ENOENT); } ZFS_EXIT(zfsvfs); return (error); } /* * pvp is the '.zfs' directory (zfsctl_node_t). * * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). * * This function is the callback to create a GFS vnode for '.zfs/snapshot' * when a lookup is performed on .zfs for "snapshot". */ vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) { vnode_t *vp; zfsctl_snapdir_t *sdp; vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, zfsctl_snapdir_readdir_cb, NULL); sdp = vp->v_data; sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&sdp->sd_snaps, snapentry_compare, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); return (vp); } vnode_t * zfsctl_mknode_shares(vnode_t *pvp) { vnode_t *vp; zfsctl_node_t *sdp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, NULL, NULL); sdp = vp->v_data; sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; return (vp); } /* ARGSUSED */ static int zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { 
ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct); VN_RELE(ZTOV(dzp)); } ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_snapdir_t *sdp = vp->v_data; ZFS_ENTER(zfsvfs); zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static void zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { zfsctl_snapdir_t *sdp = vp->v_data; void *private; private = gfs_dir_inactive(vp); if (private != NULL) { ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); mutex_destroy(&sdp->sd_lock); avl_destroy(&sdp->sd_snaps); kmem_free(private, sizeof (zfsctl_snapdir_t)); } } static const fs_operation_def_t zfsctl_tops_snapdir[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; static const fs_operation_def_t zfsctl_tops_shares[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = 
zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, { NULL } }; /* * pvp is the GFS vnode '.zfs/snapshot'. * * This creates a GFS node under '.zfs/snapshot' representing each * snapshot. This newly created GFS node is what we mount snapshot * vfs_t's ontop of. */ static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { vnode_t *vp; zfsctl_node_t *zcp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; return (vp); } static void zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; vnode_t *dvp; VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); mutex_enter(&vp->v_lock); if (vp->v_count > 1) { - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return; } mutex_exit(&vp->v_lock); ASSERT(!vn_ismntpt(vp)); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); if (sep->se_root == vp) { avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); break; } sep = next; } ASSERT(sep != NULL); mutex_exit(&sdp->sd_lock); VN_RELE(dvp); /* * Dispose of the vnode for the snapshot mount point. * This is safe to do because once this entry has been removed * from the AVL tree, it can't be found again, so cannot become * "active". 
If we lookup the same name again we will end up * creating a new vnode. */ gfs_vop_inactive(vp, cr, ct); } /* * These VP's should never see the light of day. They should always * be covered. */ static const fs_operation_def_t zfsctl_tops_snapshot[] = { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive }, NULL, NULL }; int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *vp; zfsctl_snapdir_t *sdp; zfsctl_node_t *zcp; zfs_snapentry_t *sep; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, kcred, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { vp = sep->se_root; zcp = vp->v_data; if (zcp->zc_id == objsetid) break; sep = AVL_NEXT(&sdp->sd_snaps, sep); } if (sep != NULL) { VN_HOLD(vp); /* * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ * and returns the ZFS vnode mounted on top of the GFS node. * This ZFS vnode is the root of the vfs for objset 'objsetid'. */ error = traverse(&vp); if (error == 0) { if (vp == sep->se_root) error = SET_ERROR(EINVAL); else *zfsvfsp = VTOZ(vp)->z_zfsvfs; } mutex_exit(&sdp->sd_lock); VN_RELE(vp); } else { error = SET_ERROR(EINVAL); mutex_exit(&sdp->sd_lock); } VN_RELE(dvp); return (error); } /* * Unmount any snapshots for the given filesystem. This is called from * zfs_umount() - if we have a ctldir, then go through and unmount all the * snapshots. 
*/ int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); /* * If this snapshot is not mounted, then it must * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ if (vn_ismntpt(sep->se_root)) { avl_remove(&sdp->sd_snaps, sep); error = zfsctl_unmount_snap(sep, fflags, cr); if (error) { avl_add(&sdp->sd_snaps, sep); break; } } sep = next; } mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return (error); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c (revision 318932) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c (revision 318933) @@ -1,5383 +1,5383 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2015 Joyent, Inc. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fs/fs_subr.h" #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. 
* * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. 
 *
 * (5) If the operation succeeded, generate the intent log entry for it
 *     before dropping locks.  This ensures that the ordering of events
 *     in the intent log matches the order in which they actually occurred.
 *     During ZIL replay the zfs_log_* functions will update the sequence
 *     number to indicate the zil transaction has replayed.
 *
 * (6) At the end of each vnode op, the DMU tx must always commit,
 *     regardless of whether there were any errors.
 *
 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
 *     to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * VOP_OPEN entry point.  Enforces append-only semantics (opening an
 * append-only file for write without FAPPEND fails with EPERM), runs the
 * anti-virus scanner on open when enabled, and counts synchronous opens
 * in z_sync_cnt.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus scan eligible files on open */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VOP_CLOSE entry point.  Cleans up this process's advisory locks and
 * shares, decrements the synchronous-open count on the last close of a
 * synchronous open, and runs the anti-virus scanner when enabled.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH means no more data/holes past *off in this direction */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * VOP_IOCTL entry point.  Dispatches the ZFS-specific ioctls; anything
 * unrecognized returns ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Utility functions to map and unmap a single physical page.
 These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */

/*
 * Map a page into kernel address space for the given access mode and
 * return the kernel virtual address.  Uses the fast kpm mapping when
 * available, falling back to ppmapin().
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

/*
 * Undo a zfs_map_page() mapping; "addr" must be the address it returned.
 */
void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			/*
			 * Copy the just-written data back out of the DMU
			 * into the mapped page so both copies agree.
			 */
			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			/* Page is cached: copy from the mapped page */
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			/* Not cached: read through the DMU */
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 * IN:	vp	- vnode of file to be read from.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	/* Zero-copy (xuio) read path: loan out arc buffers */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
 *	ct	- caller context (NFS/CIFS fem monitor only)
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
	 * callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			/* Zero-copy path: take the loaned buffer directly */
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Callback invoked when a dmu_sync()/zfs_get_data() operation completes:
 * releases the dbuf, range lock, and znode hold taken in zfs_get_data(),
 * and records the written block in the ZIL on success.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/* When set, inject an I/O error into the next indirect-write sync */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and it's checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*
 * VOP_ACCESS entry point: check the caller's access rights against the
 * znode, using ACE semantics when V_ACE_MASK is set, rwx semantics
 * otherwise.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		/* Swap the held reference for one on the specfs vnode */
		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	direntflags - directory lookup flags
 *	realpnp - returned pathname.
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			/* "" and "." both resolve to the directory itself */
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else if (!zdp->z_zfsvfs->z_norm &&
		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {

			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/* Zero value written over SA_ZPL_XATTR when an xattr dir is detached. */
uint64_t null_xattr = 0;

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	/*
	 * may_delete_now: at tx-build time we looked like the only holder;
	 * delete_now: re-verified under locks after the link was destroyed,
	 * so the znode can be freed in this tx instead of going on the
	 * unlinked set.
	 */
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig =
		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (xzp)
			VN_RELE(ZTOV(xzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed.  Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
		    acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(xzp->z_links, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &xzp->z_links, sizeof (xzp->z_links), tx);
			ASSERT3U(error,  ==,  0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		mutex_enter(&vp->v_lock);
		/*
		 * Drop our hold while v_lock is held (was an open-coded
		 * vp->v_count--); this must leave the count at zero since
		 * delete_now was only set when v_count == 1 above.
		 */
		VN_RELE_LOCKED(vp);
		ASSERT0(vp->v_count);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now)
		VN_RELE(vp);
	if (xzp)
		VN_RELE(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	waited = B_FALSE;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Subdirectories are not allowed inside extended attribute dirs. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * acl_ids is created once, before the top: retry label; every
	 * error path from here on must free it exactly once.
	 */
	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			/* txg rollover: wait and retry from top */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/* POSIX: refuse to remove the caller's current working directory. */
	if (vp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that noone is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 * (Lock order: z_name_lock before z_parent_lock, released in
	 * reverse order below.)
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			/* txg rollover: wait and retry from top */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 * Offsets 0-3 are the synthetic '.', '..' and '.zfs' slots
	 * (see the block comment above); larger offsets are serialized
	 * ZAP cursors.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * For a single in-kernel iovec we fill the caller's buffer directly;
	 * otherwise we stage entries in a temporary buffer and uiomove() it.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */

			if (check_sysattrs && !zap.za_normalization_conflict) {
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				VN_RELE(ZTOV(ezp));
				goto skip_entry;
			}
			VN_RELE(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			/* NOTE: d_off is the offset for the *next* entry */
			next = &(odp->d_off);
			(void) strncpy(odp->d_name, zap.za_name,
			    DIRENT64_NAMELEN(reclen));
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		if (next)
			*next = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Stashed in thread-specific data by zfs_fsync() below; presumably
 * consumed by the ZIL write path — the consumer is not visible in
 * this file chunk (NOTE(review): confirm against zfs_fsyncer_key users).
 */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush a file's dirty pages (asynchronously) and commit its ZIL
 * records so that prior writes are on stable storage.  Always
 * returns 0.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Regardless of whether this is required for standards conformance,
	 * this is the logical behavior when fsync() is called on a file with
	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
	 * going to be pushed out as part of the zil_commit().
	 */
	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
	}
	return (0);
}

/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds).
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint64_t links;
	uint64_t mtime[2], ctime[2];
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* Fetch mtime and ctime from the SA layer in one bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	mutex_enter(&zp->z_lock);
	vap->va_type = vp->v_type;
	vap->va_mode = zp->z_mode & MODEMASK;
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* the synthetic '.zfs' dir adds one link to the root directory */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
	vap->va_rdev = vp->v_rdev;
	vap->va_seq = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			uint64_t times[2];

			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
			    times, sizeof (times));
			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse =
			    ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);

	mutex_exit(&zp->z_lock);

	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
*/ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. 
*/ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? 
*/ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (vap->va_size == 0) vnevent_truncate(ZTOV(zp), ct); } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and 
user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? 
*/ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. 
*/ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? 
*/ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) VN_RELE(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { 
zfs_fuid_info_free(fuidp);	/* tail of zfs_setattr(); entry point is above this chunk */
		fuidp = NULL;
	}

	if (err) {
		/* ERESTART: txg was full - retry the whole setattr from top. */
		dmu_tx_abort(tx);
		if (err == ERESTART)
			goto top;
	} else {
		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	}

out2:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * One element of the lock chain built by zfs_rename_lock() while walking
 * up the directory tree; torn down by zfs_rename_unlock().
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 * Walks the singly-linked chain, releasing the held znode (if any) and
 * the rwlock for each element, and frees the element.
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	oidp = zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart.
				 * NOTE(review): zl here is the most recently
				 * pushed element, which equals *zlpp after
				 * any allocation, so the whole chain is
				 * released - confirm against callers.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock we now hold so it can be backed out. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (SET_ERROR(EINVAL));

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}

		/* Follow ".." one level up via the SA parent attribute. */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0, rm_err = 0;
	int		zflg = 0;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
*/
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);

	/*
	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
	 * ctldir appear to have the same v_vfsp.
	 */
	if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EXDEV));
	}

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

top:
	/* Restart point after a dmu_tx_assign() ERESTART. */
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	/* Acquire the two dirent locks in the order chosen above. */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(snm, "..") == 0)
			serr = SET_ERROR(EINVAL);
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(tnm, "..") == 0)
			terr = SET_ERROR(EINVAL);
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	/* Only block waiting for a txg after the first failed attempt. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME |
				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
				    sdl->dl_name, tdzp, tdl->dl_name, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, ZTOV(szp), tnm,
				    strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
	}

	dmu_tx_commit(tx);

	if (tzp && rm_err == 0)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	if (error == 0) {
		vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
		/* notify the target dir if it is not the same as source dir */
		if (tdvp != sdvp)
			vnevent_rename_dest_dir(tdvp, ct);
	}
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		link	- Name for new symlink entry.
 *		vap	- Attributes of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	boolean_t	waited = B_FALSE;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
*/
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	/* Only block waiting for a txg after the first failed attempt. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datsets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA attr or in the object. */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);
	mutex_exit(&zp->z_lock);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	VN_RELE(ZTOV(zp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 *	IN:	vp	- vnode of symbolic link.
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- structure containing the link path.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Mirror of zfs_symlink(): target lives in an SA attr or object. */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *	IN:	tdvp	- Directory to contain new entry.
 *		svp	- vnode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/*
	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
	 * ctldir appear to have the same v_vfsp.
	 */
	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Only the owner (or privileged callers) may create the link. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	/* Only block waiting for a txg after the first failed attempt. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * zfs_null_putapage() is used when the file system has been force
 * unmounted. It just drops the pages.
 */
/* ARGSUSED */
static int
zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int flags, cred_t *cr)
{
	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
	return (0);
}

/*
 * Push a page out to disk, klustering if possible.
 *
 *	IN:	vp	- file to push page to.
 *		pp	- page to push.
 *		flags	- additional flags.
 *		cr	- credentials of caller.
 *
 *	OUT:	offp	- start of range pushed.
 *		lenp	- len of range pushed.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * NOTE: callers must have locked the page to be pushed.  On
 * exit, the page (and all other pages in the kluster) must be
 * unlocked.
 */
/* ARGSUSED */
static int
zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int flags, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	dmu_tx_t	*tx;
	u_offset_t	off, koff;
	size_t		len, klen;
	int		err;

	off = pp->p_offset;
	len = PAGESIZE;
	/*
	 * If our blocksize is bigger than the page size, try to kluster
	 * multiple pages so that we write a full block (thus avoiding
	 * a read-modify-write).
	 */
	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
		ASSERT(koff <= zp->z_size);
		if (koff + klen > zp->z_size)
			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
	}
	ASSERT3U(btop(len), ==, btopr(len));

	/*
	 * Can't push pages past end-of-file.
	 */
	if (off >= zp->z_size) {
		/* ignore all pages */
		err = 0;
		goto out;
	} else if (off + len > zp->z_size) {
		/* Trim the kluster to the pages before EOF. */
		int npages = btopr(zp->z_size - off);
		page_t *trunc;

		page_list_break(&pp, &trunc, npages);
		/* ignore pages past end of file */
		if (trunc)
			pvn_write_done(trunc, flags);
		len = zp->z_size - off;
	}

	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		err = SET_ERROR(EDQUOT);
		goto out;
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	/* Single-page writes map the page; larger klusters go in bulk. */
	if (zp->z_blksz <= PAGESIZE) {
		caddr_t va = zfs_map_page(pp, S_READ);
		ASSERT3U(len, <=, PAGESIZE);
		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
		zfs_unmap_page(pp, va);
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
	}
	dmu_tx_commit(tx);

out:
	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
	if (offp)
		*offp = off;
	if (lenp)
		*lenp = len;

	return (err);
}

/*
 * Copy the portion of the file indicated from pages into the file.
 * The pages are stored in a page list attached to the files vnode.
 *
 *	IN:	vp	- vnode of file to push page data to.
 *		off	- position in file to put data.
 *		len	- amount of data to write.
 *		flags	- flags to control the operation.
 *		cr	- credentials of caller.
 *		ct	- caller context.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
    cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		*pp;
	size_t		io_len;
	u_offset_t	io_off;
	uint_t		blksz;
	rl_t		*rl;
	int		error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * There's nothing to do if no data is cached.
	 */
	if (!vn_has_cached_data(vp)) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Align this request to the file block size in case we kluster.
	 * XXX - this can result in pretty aggresive locking, which can
	 * impact simultanious read/write access.  One option might be
	 * to break up long requests (len == 0) into block-by-block
	 * operations to get narrower locking.
	 */
	blksz = zp->z_blksz;
	if (ISP2(blksz))
		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
	else
		io_off = 0;
	if (len > 0 && ISP2(blksz))
		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
	else
		io_len = 0;

	if (io_len == 0) {
		/*
		 * Search the entire vp list for pages >= io_off.
*/
		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
		goto out;
	}
	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);

	if (off > zp->z_size) {
		/* past end of file */
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);

	/* Walk the range page by page; zfs_putapage() may advance io_len. */
	for (off = io_off; io_off < off + len; io_off += io_len) {
		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
			pp = page_lookup(vp, io_off,
			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
		} else {
			pp = page_lookup_nowait(vp, io_off,
			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
		}

		if (pp != NULL && pvn_getdirty(pp, flags)) {
			int err;

			/*
			 * Found a dirty page to push
			 */
			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			if (err)
				error = err;
		} else {
			io_len = PAGESIZE;
		}
	}
out:
	zfs_range_unlock(rl);
	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Vnode inactive entry point: push any cached pages, sync a dirty atime,
 * and hand the znode to zfs_zinactive()/zfs_znode_free().
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		if (vn_has_cached_data(vp)) {
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		}

		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		ASSERT(vp->v_count == 1);
		/*
		 * NOTE(review): this revision (r318933) replaces the direct
		 * "vp->v_count = 0" with VN_RELE_LOCKED(), which drops the
		 * last hold under v_lock - confirm against vnode.h.
		 */
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/*
	 * Attempt to push any data in the page cache.  If this fails
	 * we will get kicked out later in zfs_zinactive().
	 */
	if (vn_has_cached_data(vp)) {
		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
		    cr);
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			mutex_enter(&zp->z_lock);
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/*
 * Bounds-check the seek operation.
 *
 *	IN:	vp	- vnode seeking within
 *		ooff	- old file offset
 *		noffp	- pointer to new file offset
 *		ct	- caller context
 *
 *	RETURN:	0 on success, EINVAL if new offset invalid.
 */
/* ARGSUSED */
static int
zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	if (vp->v_type == VDIR)
		return (0);
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}

/*
 * Pre-filter the generic locking function to trap attempts to place
 * a mandatory lock on a memory mapped file.
 */
static int
zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * We are following the UFS semantics with respect to mapcnt
	 * here: If we see that the file is mapped already, then we will
	 * return an error, but we don't worry about races between this
	 * function and zfs_map().
	 */
	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}

	ZFS_EXIT(zfsvfs);
	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}

/*
 * If we can't find a page in the cache, we will create a new page
 * and fill it with file data.  For efficiency, we may try to fill
 * multiple pages at once (klustering) to fill up the supplied page
 * list.
 * Note that the pages to be filled are held with an exclusive
 * lock to prevent access by other threads while they are being filled.
 */
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
{
	znode_t *zp = VTOZ(vp);
	page_t *pp, *cur_pp;
	objset_t *os = zp->z_zfsvfs->z_os;
	u_offset_t io_off, total;
	size_t io_len;
	int err;

	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
		/*
		 * We only have a single page, don't bother klustering
		 */
		io_off = off;
		io_len = PAGESIZE;
		pp = page_create_va(vp, io_off, io_len,
		    PG_EXCL | PG_WAIT, seg, addr);
	} else {
		/*
		 * Try to find enough pages to fill the page list
		 */
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, plsz, 0);
	}
	if (pp == NULL) {
		/*
		 * The page already exists, nothing to do here.
		 */
		*pl = NULL;
		return (0);
	}

	/*
	 * Fill the pages in the kluster.
	 */
	cur_pp = pp;
	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
		caddr_t va;

		ASSERT3U(io_off, ==, cur_pp->p_offset);
		/* Map the page and copy one page of file data into it. */
		va = zfs_map_page(cur_pp, S_WRITE);
		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
		    DMU_READ_PREFETCH);
		zfs_unmap_page(cur_pp, va);
		if (err) {
			/* On error, toss the entire kluster */
			pvn_read_done(pp, B_ERROR);
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}
		cur_pp = cur_pp->p_next;
	}

	/*
	 * Fill in the page list array from the kluster starting
	 * from the desired offset `off'.
	 * NOTE: the page list will always be null terminated.
	 */
	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	ASSERT(pl == NULL || (*pl)->p_offset == off);

	return (0);
}

/*
 * Return pointers to the pages for the file region [off, off + len]
 * in the pl array.  If plsz is greater than len, this function may
 * also return page pointers from after the specified region
 * (i.e. the region [off, off + plsz]).  These additional pages are
 * only returned if they are already in the cache, or were created as
 * part of a klustered read.
 *
 * IN:	vp	- vnode of file to get data from.
 *	off	- position in file to get data from.
 *	len	- amount of data to retrieve.
 *	plsz	- length of provided page list.
 *	seg	- segment to obtain pages for.
 *	addr	- virtual address of fault.
 *	rw	- mode of created pages.
 *	cr	- credentials of caller.
 *	ct	- caller context.
 *
 * OUT:	protp	- protection mode of created pages.
 *	pl	- list of pages created.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		**pl0 = pl;	/* start of the array, for error unwind */
	int		err = 0;

	/* we do our own caching, faultahead is unnecessary */
	if (pl == NULL)
		return (0);
	else if (len > plsz)
		len = plsz;
	else
		len = P2ROUNDUP(len, PAGESIZE);
	ASSERT(plsz >= len);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (protp)
		*protp = PROT_ALL;

	/*
	 * Loop through the requested range [off, off + len) looking
	 * for pages.  If we don't find a page, we will need to create
	 * a new page and fill it with data from the file.
	 */
	while (len > 0) {
		/*
		 * Either the page is already cached (take it SE_SHARED and
		 * terminate the list after it), or zfs_fillpage() creates
		 * and fills one or more pages starting at 'off'.
		 */
		if (*pl = page_lookup(vp, off, SE_SHARED))
			*(pl+1) = NULL;
		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
			goto out;
		/*
		 * Consume whatever run of pages the lookup/fill produced,
		 * advancing the request window and the output array.  len
		 * may hit zero before the run ends when a kluster returned
		 * more pages than were asked for; plsz keeps tracking the
		 * remaining array capacity.
		 */
		while (*pl) {
			ASSERT3U((*pl)->p_offset, ==, off);
			off += PAGESIZE;
			addr += PAGESIZE;
			if (len > 0) {
				ASSERT3U(len, >=, PAGESIZE);
				len -= PAGESIZE;
			}
			ASSERT3U(plsz, >=, PAGESIZE);
			plsz -= PAGESIZE;
			pl++;
		}
	}

	/*
	 * Fill out the page array with any pages already in the cache.
	 */
	while (plsz > 0 && (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
			off += PAGESIZE;
			plsz -= PAGESIZE;
	}
out:
	if (err) {
		/*
		 * Release any pages we have previously locked.
		 */
		while (pl > pl0)
			page_unlock(*--pl);
	} else {
		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	}

	/* The returned page list is always NULL-terminated. */
	*pl = NULL;

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * Request a memory map for a section of a file.
 * This code interacts
 * with common code and the VM system as follows:
 *
 * - common code calls mmap(), which ends up in smmap_common()
 * - this calls VOP_MAP(), which takes you into (say) zfs
 * - zfs_map() calls as_map(), passing segvn_create() as the callback
 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
 * - zfs_addmap() updates z_mapcnt
 */
/*ARGSUSED*/
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	segvn_crargs_t vn_a;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* No writable mappings of immutable/read-only/append-only files. */
	if ((prot & PROT_WRITE) && (zp->z_pflags &
	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Anti-virus quarantined files may not be read or executed. */
	if ((prot & (PROT_READ | PROT_EXEC)) &&
	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	if (vp->v_flag & VNOMAP) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSYS));
	}

	/* Reject offsets/lengths that would wrap past the maximum offset. */
	if (off < 0 || len > MAXOFFSET_T - off) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENXIO));
	}

	/* Only regular files may be mapped. */
	if (vp->v_type != VREG) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENODEV));
	}

	/*
	 * If file is locked, disallow mapping.
	 */
	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}

	as_rangelock(as);
	/* Pick (or validate) the user address for the new segment. */
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	/* Create the segment; segvn_create() calls back into VOP_ADDMAP(). */
	error = as_map(as, *addrp, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Account for a new mapping of the file: bump z_mapcnt by the number
 * of pages mapped.  zfs_frlock() consults this count when refusing
 * mandatory locks on mapped files.
 */
/* ARGSUSED */
static int
zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	uint64_t pages = btopr(len);

	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
	return (0);
}

/*
 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
 * more accurate mtime for the associated file.  Since we don't have a way of
 * detecting when the data was actually modified, we have to resort to
 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
 * last page is pushed.  The problem occurs when the msync() call is omitted,
 * which is by far the most common case:
 *
 *	open()
 *	mmap()
 *	<modify memory>
 *	munmap()
 *	close()
 *