Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c (revision 299945) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c (revision 299946) @@ -1,1256 +1,1246 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Portions Copyright 2007 Shivakumar GN */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Generic pseudo-filesystem routines. * * There are significant similarities between the implementation of certain file * system entry points across different filesystems. While one could attempt to * "choke up on the bat" and incorporate common functionality into a VOP * preamble or postamble, such an approach is limited in the benefit it can * provide. In this file we instead define a toolkit of routines which can be * called from a filesystem (with in-kernel pseudo-filesystems being the focus * of the exercise) in a more component-like fashion. 
* * There are three basic classes of routines: * * 1) Lowlevel support routines * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() * gfs_readdir_emit() * gfs_readdir_emitn() * gfs_readdir_pred() * gfs_readdir_fini() * gfs_lookup_dot() * * 2) Complete GFS management * * These routines take a more active role in management of the * pseudo-filesystem. They handle the relationship between vnode private * data and VFS data, as well as the relationship between vnodes in the * directory hierarchy. * * In order to use these interfaces, the first member of every private * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control * to GFS. * * gfs_file_create() * gfs_dir_create() * gfs_root_create() * * gfs_file_inactive() * gfs_dir_inactive() * gfs_dir_lookup() * gfs_dir_readdir() * * gfs_vop_reclaim() * gfs_vop_lookup() * gfs_vop_readdir() * gfs_vop_map() * * 3) Single File pseudo-filesystems * * This routine creates a rooted file to be overlayed ontop of another * file in the physical filespace. * * Note that the parent is NULL (actually the vfs), but there is nothing * technically keeping such a file from utilizing the "Complete GFS * management" set of routines. * * gfs_root_create_file() */ #ifdef illumos /* * gfs_make_opsvec: take an array of vnode type definitions and create * their vnodeops_t structures * * This routine takes an array of gfs_opsvec_t's. It could * alternatively take an array of gfs_opsvec_t*'s, which would allow * vnode types to be completely defined in files external to the caller * of gfs_make_opsvec(). As it stands, much more sharing takes place -- * both the caller and the vnode type provider need to access gfsv_ops * and gfsv_template, and the caller also needs to know gfsv_name. 
*/ int gfs_make_opsvec(gfs_opsvec_t *vec) { int error, i; for (i = 0; ; i++) { if (vec[i].gfsv_name == NULL) return (0); error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, vec[i].gfsv_ops); if (error) break; } cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", vec[i].gfsv_name); for (i--; i >= 0; i--) { vn_freevnodeops(*vec[i].gfsv_ops); *vec[i].gfsv_ops = NULL; } return (error); } #endif /* illumos */ /* * Low level directory routines * * These routines provide some simple abstractions for reading directories. * They are designed to be used by existing pseudo filesystems (namely procfs) * that already have a complicated management infrastructure. */ /* * gfs_get_parent_ino: used to obtain a parent inode number and the * inode number of the given vnode in preparation for calling gfs_readdir_init. */ int gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, ino64_t *pino, ino64_t *ino) { vnode_t *parent; gfs_dir_t *dp = dvp->v_data; int error; *ino = dp->gfsd_file.gfs_ino; parent = dp->gfsd_file.gfs_parent; if (parent == NULL) { *pino = *ino; /* root of filesystem */ } else if (dvp->v_flag & V_XATTRDIR) { #ifdef TODO vattr_t va; va.va_mask = AT_NODEID; error = VOP_GETATTR(parent, &va, 0, cr, ct); if (error) return (error); *pino = va.va_nodeid; #else panic("%s:%u: not implemented", __func__, __LINE__); #endif } else { *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; } return (0); } /* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length * ureclen - the exported file-space record length (1 for non-legacy FSs) * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. 
*
 * Typical VOP_READDIR usage of gfs_readdir_*:
 *
 *	if ((error = gfs_readdir_init(...)) != 0)
 *		return (error);
 *	eof = 0;
 *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
 *		if (!consumer_entry_at(voffset))
 *			voffset = consumer_next_entry(voffset);
 *		if (consumer_eof(voffset)) {
 *			eof = 1;
 *			break;
 *		}
 *		if ((error = gfs_readdir_emit(..., voffset,
 *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
 *			break;
 *	}
 *	return (gfs_readdir_fini(..., error, eofp, eof));
 *
 * As you can see, a zero result from gfs_readdir_pred() or
 * gfs_readdir_emit() indicates that processing should continue,
 * whereas a non-zero result indicates that the loop should terminate.
 * Most consumers need do nothing more than let gfs_readdir_fini()
 * determine what the cause of failure was and return the appropriate
 * value.
 *
 * EINVAL is returned for a negative offset, an offset that is not a
 * multiple of ureclen, or a request with no room left (uio_resid <= 0).
 * The zero-filled scratch entry allocated here is released by
 * gfs_readdir_fini().
 */
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
	size_t dirent_size;

	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
	    (uiop->uio_loffset % ureclen) != 0)
		return (EINVAL);

	st->grd_ureclen = ureclen;
	/* Remember the starting residual so "nothing emitted yet" is detectable. */
	st->grd_oresid = uiop->uio_resid;
	st->grd_namlen = name_max;
	/* Size the scratch entry for the longest name, in the requested format. */
	if (flags & V_RDDIR_ENTFLAGS)
		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
	else
		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
	st->grd_parent = parent;
	st->grd_self = self;
	st->grd_flags = flags;

	return (0);
}

/*
 * gfs_readdir_emit_int: internal routine to emit directory entry
 *
 *   st		- the current readdir state, which must have d_ino/ed_ino
 *		  and d_name/ed_name set
 *   uiop	- caller-supplied uio pointer
 *   next	- the offset of the next entry
 *   ncookies	- remaining cookie slots; decremented when a cookie is stored
 *   cookies	- address of the cookie cursor; must itself be non-NULL, and
 *		  a cookie is stored only when *cookies is non-NULL
 *
 * Returns 0 on success, EFAULT if uiomove() fails, EINVAL if the very first
 * entry does not fit in the caller's buffer, and -1 if a later entry does
 * not fit (i.e. the buffer is simply full).
 */
static int
gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
    int *ncookies, u_long **cookies)
{
	int reclen, namlen;
	dirent64_t *dp;
	edirent_t *edp;

	/* The record length is derived from the actual name, not grd_namlen. */
	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp = st->grd_dirent;
		namlen = strlen(edp->ed_name);
		reclen = EDIRENT_RECLEN(namlen);
	} else {
		dp = st->grd_dirent;
		namlen = strlen(dp->d_name);
		reclen = DIRENT64_RECLEN(namlen);
	}

	if (reclen > uiop->uio_resid) {
		/*
		 * Error if no entries were returned yet
		 */
		if (uiop->uio_resid == st->grd_oresid)
			return (EINVAL);
		return (-1);
	}

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp->ed_off = next;
		edp->ed_reclen = (ushort_t)reclen;
	} else {
		/* XXX: This can change in the future. */
		dp->d_reclen = (ushort_t)reclen;
		/* Every entry is reported as a directory. */
		dp->d_type = DT_DIR;
		dp->d_namlen = namlen;
	}

	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
		return (EFAULT);

	/* Overwrite the offset: entries are addressed by slot, not by bytes. */
	uiop->uio_loffset = next;
	if (*cookies != NULL) {
		**cookies = next;
		(*cookies)++;
		(*ncookies)--;
		KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies));
	}

	return (0);
}

/*
 * gfs_readdir_emit: emit a directory entry
 *   voff       - the virtual offset (obtained from gfs_readdir_pred)
 *   ino        - the entry's inode
 *   name       - the entry's name
 *   eflags     - value for ed_eflags (if processing edirent_t)
 *   ncookies, cookies - passed through to gfs_readdir_emit_int()
 *
 * Returns a 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate.  A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies)
{
	/* Virtual offsets start after the two slots used by "." and "..". */
	offset_t off = (voff + 2) * st->grd_ureclen;

	/*
	 * strncpy() copies at most grd_namlen bytes and adds no terminator
	 * when the name is that long or longer.
	 * NOTE(review): termination then relies on the byte after the name
	 * area still being zero from kmem_zalloc(); this assumes the
	 * *_RECLEN() macros reserve room for it - TODO confirm.
	 */
	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edirent_t *edp = st->grd_dirent;

		edp->ed_ino = ino;
		(void) strncpy(edp->ed_name, name, st->grd_namlen);
		edp->ed_eflags = eflags;
	} else {
		dirent64_t *dp = st->grd_dirent;

		dp->d_ino = ino;
		(void) strncpy(dp->d_name, name, st->grd_namlen);
	}

	/*
	 * Inter-entry offsets are invalid, so we assume a record size of
	 * grd_ureclen and explicitly set the offset appropriately.
	 */
	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies,
	    cookies));
}

#ifdef illumos
/*
 * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
 * instead of a string for the entry's name.
*/ int gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, unsigned long num) { char buf[40]; numtos(num, buf); return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); } #endif /* * gfs_readdir_pred: readdir loop predicate * voffp - a pointer in which the next virtual offset should be stored * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp, int *ncookies, u_long **cookies) { offset_t off, voff; int error; top: if (uiop->uio_resid <= 0) return (-1); off = uiop->uio_loffset / st->grd_ureclen; voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, ".", 0, ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, "..", 0, ncookies, cookies)) == 0) goto top; } else { *voffp = voff; return (0); } return (error); } /* * gfs_readdir_fini: generic readdir cleanup * error - if positive, an error to return * eofp - the eofp passed to readdir * eof - the eof value * * Returns a 0 on success, a non-zero errno on failure. This result * should be returned from readdir. */ int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { size_t dirent_size; if (st->grd_flags & V_RDDIR_ENTFLAGS) dirent_size = EDIRENT_RECLEN(st->grd_namlen); else dirent_size = DIRENT64_RECLEN(st->grd_namlen); kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) *eofp = eof; return (0); } /* * gfs_lookup_dot * * Performs a basic check for "." and ".." directory entries. 
*/
int
gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
{
	int ltype;

	/* An empty name and "." both resolve to the directory itself. */
	if (*nm == '\0' || strcmp(nm, ".") == 0) {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	} else if (strcmp(nm, "..") == 0) {
		/*
		 * NOTE(review): the lines prefixed with '-' and '+' below are
		 * an unresolved diff hunk (old and new handling of ".."),
		 * not C.  'ltype' is referenced only by the '-' side.
		 */
-		if (pvp == NULL) {
-			ASSERT(dvp->v_flag & VROOT);
-			VN_HOLD(dvp);
-			*vpp = dvp;
-			ASSERT_VOP_ELOCKED(dvp, "gfs_lookup_dot: non-locked dvp");
-		} else {
-			ltype = VOP_ISLOCKED(dvp);
-			VOP_UNLOCK(dvp, 0);
-			VN_HOLD(pvp);
-			*vpp = pvp;
-			vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-			vn_lock(dvp, ltype | LK_RETRY);
-		}
+		ASSERT(pvp != NULL);
+		VN_HOLD(pvp);
+		*vpp = pvp;
		return (0);
	}

	/* Neither "." nor "..": tell the caller to do a real lookup. */
	return (-1);
}

/*
 * gfs_file_create(): create a new GFS file
 *
 *   size	- size of private data structure (v_data)
 *   pvp	- parent vnode (GFS directory)
 *   vfsp	- filesystem the new vnode is created on
 *   ops	- vnode operations vector
 *
 * In order to use this interface, the parent vnode must have been created by
 * gfs_dir_create(), and the private data stored in v_data must have a
 * 'gfs_file_t' as its first field.
 *
 * Given these constraints, this routine will automatically:
 *
 *	- Allocate v_data for the vnode
 *	- Initialize necessary fields in the vnode
 *	- Hold the parent
 *
 * The vnode is returned exclusively locked (see the vn_lock() call below).
 */
vnode_t *
gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
{
	gfs_file_t *fp;
	vnode_t *vp;
	int error;

	/*
	 * Allocate vnode and internal data structure
	 */
	fp = kmem_zalloc(size, KM_SLEEP);
	error = getnewvnode("zfs_gfs", vfsp, ops, &vp);
	/* The getnewvnode() result is checked only through ASSERT. */
	ASSERT(error == 0);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_data = (caddr_t)fp;

	/*
	 * Set up various pointers
	 */
	fp->gfs_vnode = vp;
	fp->gfs_parent = pvp;
	/* Recorded so gfs_vop_reclaim() can free the right amount. */
	fp->gfs_size = size;
	fp->gfs_type = GFS_FILE;

	/* VV_FORCEINSMQ is set only for the duration of insmntque(). */
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque(vp, vfsp);
	vp->v_vflag &= ~VV_FORCEINSMQ;
	KASSERT(error == 0, ("insmntque() failed: error %d", error));

	/*
	 * Initialize vnode and hold parent.
	 */
	if (pvp)
		VN_HOLD(pvp);

	return (vp);
}

/*
 * gfs_dir_create: creates a new directory in the parent
 *
 *   size	- size of private data structure (v_data)
 *   pvp	- parent vnode (GFS directory)
 *   ops	- vnode operations vector
 *   entries	- NULL-terminated list of static entries (if any)
 *   maxlen	- maximum length of a directory entry
 *   readdir_cb	- readdir callback (see gfs_dir_readdir)
 *   inode_cb	- inode callback (see gfs_dir_readdir)
 *   lookup_cb	- lookup callback (see gfs_dir_lookup)
 *
 * In order to use this function, the first member of the private vnode
 * structure (v_data) must be a gfs_dir_t.  For each directory, there are
 * static entries, defined when the structure is initialized, and dynamic
 * entries, retrieved through callbacks.
 *
 * If a directory has static entries, then it must supply a inode callback,
 * which will compute the inode number based on the parent and the index.
 * For a directory with dynamic entries, the caller must supply a readdir
 * callback and a lookup callback.  If a static lookup fails, we fall back to
 * the supplied lookup callback, if any.
 *
 * This function also performs the same initialization as gfs_file_create().
*/ vnode_t * gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; gfs_dir_t *dp; gfs_dirent_t *de; vp = gfs_file_create(struct_size, pvp, vfsp, ops); vp->v_type = VDIR; dp = vp->v_data; dp->gfsd_file.gfs_type = GFS_DIR; dp->gfsd_maxlen = maxlen; if (entries != NULL) { for (de = entries; de->gfse_name != NULL; de++) dp->gfsd_nstatic++; dp->gfsd_static = kmem_alloc( dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); bcopy(entries, dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } dp->gfsd_readdir = readdir_cb; dp->gfsd_lookup = lookup_cb; dp->gfsd_inode = inode_cb; mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); return (vp); } /* * gfs_root_create(): create a root vnode for a GFS filesystem * * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The * only difference is that it takes a vfs_t instead of a vnode_t as its parent. */ vnode_t * gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; #ifdef illumos VFS_HOLD(vfsp); #endif vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb, maxlen, readdir_cb, lookup_cb); /* Manually set the inode */ ((gfs_file_t *)vp->v_data)->gfs_ino = ino; vp->v_flag |= VROOT; return (vp); } #ifdef illumos /* * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem * * Similar to gfs_root_create(), this creates a root vnode for a file to * be the pseudo-filesystem. 
*/
vnode_t *
gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
{
	/*
	 * NOTE(review): gfs_file_create() is defined in this file with four
	 * parameters (size, pvp, vfsp, ops) but is called here with three;
	 * this function is only compiled when illumos is defined.
	 */
	vnode_t *vp = gfs_file_create(size, NULL, ops);

	((gfs_file_t *)vp->v_data)->gfs_ino = ino;

	VFS_HOLD(vfsp);
	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

	return (vp);
}
#endif	/* illumos */

/*
 * gfs_file_inactive()
 *
 * Called from the VOP_RECLAIM() routine.  If necessary, this routine will
 * remove the given vnode from the parent directory and clean up any references
 * in the VFS layer.
 *
 * If the vnode was not removed (due to a race with vget), then NULL is
 * returned.  Otherwise, a pointer to the private data is returned.
 *
 * In this version NULL is returned only when the parent's v_data is already
 * NULL; in that case the parent reference is not released here.
 */
void *
gfs_file_inactive(vnode_t *vp)
{
	int i;
	gfs_dirent_t *ge = NULL;
	gfs_file_t *fp = vp->v_data;
	gfs_dir_t *dp = NULL;
	void *data;

	/* Roots and xattr directories are not cached in a parent directory. */
	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
		goto found;

	/*
	 * XXX cope with a FreeBSD-specific race wherein the parent's
	 * snapshot data can be freed before the parent is
	 */
	if ((dp = fp->gfs_parent->v_data) == NULL)
		return (NULL);

	/*
	 * First, see if this vnode is cached in the parent.
	 */
	gfs_dir_lock(dp);

	/*
	 * Find it in the set of static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++)  {
		ge = &dp->gfsd_static[i];

		if (ge->gfse_vnode == vp)
			goto found;
	}

	/*
	 * If 'ge' is NULL, then it is a dynamic entry.
	 */
	ge = NULL;

found:
	/* From here on, dp is non-NULL only if the directory lock is held. */
#ifdef TODO
	if (vp->v_flag & V_XATTRDIR)
		VI_LOCK(fp->gfs_parent);
#endif
	VI_LOCK(vp);
	/*
	 * Really remove this vnode
	 */
	data = vp->v_data;
	if (ge != NULL) {
		/*
		 * If this was a statically cached entry, simply set the
		 * cached vnode to NULL.
		 */
		ge->gfse_vnode = NULL;
	}
	VI_UNLOCK(vp);

	/*
	 * Free vnode and release parent
	 */
	if (fp->gfs_parent) {
		if (dp)
			gfs_dir_unlock(dp);
		/* vp is unlocked around the release of the parent reference. */
		VOP_UNLOCK(vp, 0);
		VN_RELE(fp->gfs_parent);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	} else {
		ASSERT(vp->v_vfsp != NULL);
#ifdef illumos
		VFS_RELE(vp->v_vfsp);
#endif
	}
#ifdef TODO
	if (vp->v_flag & V_XATTRDIR)
		VI_UNLOCK(fp->gfs_parent);
#endif
	return (data);
}

/*
 * gfs_dir_inactive()
 *
 * Same as above, but for directories.
 *
 * When gfs_file_inactive() hands back the private data, the directory mutex
 * is destroyed and the copied static-entry array is freed.  When it returns
 * NULL neither happens.
 */
void *
gfs_dir_inactive(vnode_t *vp)
{
	gfs_dir_t *dp;

	ASSERT(vp->v_type == VDIR);

	if ((dp = gfs_file_inactive(vp)) != NULL) {
		mutex_destroy(&dp->gfsd_lock);
		if (dp->gfsd_nstatic)
			kmem_free(dp->gfsd_static,
			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
	}

	return (dp);
}

/*
 * gfs_dir_lookup_dynamic()
 *
 * This routine looks up the provided name amongst the dynamic entries
 * in the gfs directory and returns the corresponding vnode, if found.
 *
 * The gfs directory is expected to be locked by the caller prior to
 * calling this function.  The directory will be unlocked during the
 * execution of this function, but will be locked upon return from the
 * function.  This function returns 0 on success, non-zero on error.
 *
 * The dynamic lookups are performed by invoking the lookup
 * callback, which is passed to this function as the first argument.
* The arguments to the callback are: * * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode * cr - pointer to cred * flags - flags value from lookup request * ignored here; currently only used to request * insensitive lookups * direntflgs - output parameter, directory entry flags * ignored here; currently only used to indicate a lookup * has more than one possible match when case is not considered * realpnp - output parameter, real pathname * ignored here; when lookup was performed case-insensitively, * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ static int gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, int *direntflags, pathname_t *realpnp) { gfs_file_t *fp; ino64_t ino; int ret; ASSERT(GFS_DIR_LOCKED(dp)); /* * Drop the directory lock, as the lookup routine * will need to allocate memory, or otherwise deadlock on this * directory. */ gfs_dir_unlock(dp); ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* * The callback for extended attributes returns a vnode * with v_data from an underlying fs. */ if (ret == 0 && !IS_XATTRDIR(dvp)) { fp = (gfs_file_t *)((*vpp)->v_data); fp->gfs_index = -1; fp->gfs_ino = ino; } return (ret); } /* * gfs_dir_lookup_static() * * This routine looks up the provided name amongst the static entries * in the gfs directory and returns the corresponding vnode, if found. * The first argument to the function is a pointer to the comparison * function this function should use to decide if names are a match. * * If a match is found, and GFS_CACHE_VNODE is set and the vnode * exists, we simply return the existing vnode. Otherwise, we call * the static entry's callback routine, caching the result if * necessary. 
If the idx pointer argument is non-NULL, we use it to
 * return the index of the matching static entry.
 *
 * The gfs directory is expected to be locked by the caller prior to calling
 * this function.  The directory may be unlocked during the execution of
 * this function, but will be locked upon return from the function.
 *
 * When rpnp is non-NULL the matching entry's own name is copied into it.
 *
 * This function returns 0 if a match is found, ENOENT if not.
 */
static int
gfs_dir_lookup_static(int (*compare)(const char *, const char *),
    gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
    vnode_t **vpp, pathname_t *rpnp)
{
	gfs_dirent_t *ge;
	vnode_t *vp = NULL;
	int i;

	ASSERT(GFS_DIR_LOCKED(dp));

	/*
	 * Search static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++) {
		ge = &dp->gfsd_static[i];

		if (compare(ge->gfse_name, nm) == 0) {
			if (rpnp)
				(void) strlcpy(rpnp->pn_buf, ge->gfse_name,
				    rpnp->pn_bufsize);

			/* Fast path: a cached vnode only needs a hold. */
			if (ge->gfse_vnode) {
				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
				vp = ge->gfse_vnode;
				VN_HOLD(vp);
				break;
			}

			/*
			 * We drop the directory lock, as the constructor will
			 * need to do KM_SLEEP allocations.  If we return from
			 * the constructor only to find that a parallel
			 * operation has completed, and GFS_CACHE_VNODE is set
			 * for this entry, we discard the result in favor of
			 * the cached vnode.
			 */
			gfs_dir_unlock(dp);
			vp = ge->gfse_ctor(dvp);
			gfs_dir_lock(dp);

			((gfs_file_t *)vp->v_data)->gfs_index = i;

			/* Set the inode according to the callback. */
			((gfs_file_t *)vp->v_data)->gfs_ino =
			    dp->gfsd_inode(dvp, i);

			if (ge->gfse_flags & GFS_CACHE_VNODE) {
				if (ge->gfse_vnode == NULL) {
					ge->gfse_vnode = vp;
				} else {
					/*
					 * A parallel constructor beat us to it;
					 * return existing vnode.  We have to be
					 * careful because we can't release the
					 * current vnode while holding the
					 * directory lock; its inactive routine
					 * will try to lock this directory.
					 */
					vnode_t *oldvp = vp;
					vp = ge->gfse_vnode;
					VN_HOLD(vp);

					gfs_dir_unlock(dp);
					VN_RELE(oldvp);
					gfs_dir_lock(dp);
				}
			}
			break;
		}
	}

	/* vp is still NULL only if no static entry matched. */
	if (vp == NULL)
		return (ENOENT);
	else if (idx)
		*idx = i;
	*vpp = vp;
	return (0);
}

/*
 * gfs_dir_lookup()
 *
 * Looks up the given name in the directory and returns the corresponding
 * vnode, if found.
 *
 * First, we search statically defined entries, if any, with a call to
 * gfs_dir_lookup_static().  If no static entry is found, and we have
 * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
 *
 * When FIGNORECASE is requested and direntflags is supplied, further
 * case-insensitive matches (static or dynamic) are reported by setting
 * ED_CASE_CONFLICT in *direntflags.
 *
 * This function returns 0 on success, non-zero on error.
 */
int
gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
    int flags, int *direntflags, pathname_t *realpnp)
{
	gfs_dir_t *dp = dvp->v_data;
	boolean_t casecheck;
	vnode_t *dynvp = NULL;
	vnode_t *vp = NULL;
	int (*compare)(const char *, const char *);
	int error, idx;

	ASSERT(dvp->v_type == VDIR);

	/* "", "." and ".." are resolved without taking the directory lock. */
	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
		return (0);

	casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
	if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
	    (flags & FIGNORECASE))
		compare = strcasecmp;
	else
		compare = strcmp;

	gfs_dir_lock(dp);

	error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);

	/* Look for a second static entry that matches ignoring case. */
	if (vp && casecheck) {
		gfs_dirent_t *ge;
		int i;

		for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
			ge = &dp->gfsd_static[i];

			if (strcasecmp(ge->gfse_name, nm) == 0) {
				*direntflags |= ED_CASE_CONFLICT;
				goto out;
			}
		}
	}

	/*
	 * Try the dynamic entries if the static lookup failed, or to detect
	 * a case conflict.  realpnp is withheld when a static match already
	 * filled it in.
	 */
	if ((error || casecheck) && dp->gfsd_lookup)
		error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
		    &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);

	if (vp && dynvp) {
		/* static and dynamic entries are case-insensitive conflict */
		ASSERT(casecheck);
		*direntflags |= ED_CASE_CONFLICT;
		VN_RELE(dynvp);
	} else if (vp == NULL) {
		vp = dynvp;
	} else if (error == ENOENT) {
		/* Static match stands; the dynamic probe simply found nothing. */
		error = 0;
	} else if (error) {
		/* The dynamic probe failed for another reason: drop the match. */
		VN_RELE(vp);
		vp = NULL;
	}

out:
	gfs_dir_unlock(dp);

	*vpp = vp;
	return (error);
}

/*
 * gfs_dir_readdir: does a readdir() on the given directory
 *
 *    dvp	- directory vnode
 *    uiop	- uio structure
 *    eofp	- eof pointer
 *    data	- arbitrary data passed to readdir callback
 *
 * This routine does all the readdir() dirty work.  Even so, the caller must
 * supply two callbacks in order to get full compatibility.
 *
 * If the directory contains static entries, an inode callback must be
 * specified.  This avoids having to create every vnode and call VOP_GETATTR()
 * when reading the directory.  This function has the following arguments:
 *
 *	ino_t gfs_inode_cb(vnode_t *vp, int index);
 *
 *	vp	- vnode for the directory
 *	index	- index in original gfs_dirent_t array
 *
 *	Returns the inode number for the given entry.
 *
 * For directories with dynamic entries, a readdir callback must be provided.
 * This is significantly more complex, thanks to the particulars of
 * VOP_READDIR().
 *
 *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *off, offset_t *nextoff, void *data, int flags)
 *
 *	vp	- directory vnode
 *	dp	- directory entry, sized according to maxlen given to
 *		  gfs_dir_create().  callback must fill in d_name and
 *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
 *		  (if an edirent_t).  edirent_t is used if V_RDDIR_ENTFLAGS
 *		  is set in 'flags'.
 *	eofp	- callback must set to 1 when EOF has been reached
 *	off	- on entry, the last offset read from the directory.  Callback
 *		  must set to the offset of the current entry, typically left
 *		  untouched.
 *	nextoff	- callback must set to offset of next entry.
Typically
 *		  (off + 1)
 *	data	- caller-supplied data
 *	flags	- VOP_READDIR flags
 *
 *	Return 0 on success, or error on failure.
 *
 * ncookies and cookies are handed to the gfs_readdir_*() helpers, which
 * record one cookie per emitted entry when *cookies is non-NULL.
 */
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
    u_long **cookies, void *data, cred_t *cr, int flags)
{
	gfs_readdir_state_t gstate;
	int error, eof = 0;
	ino64_t ino, pino;
	offset_t off, next;
	gfs_dir_t *dp = dvp->v_data;

	error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino);
	if (error)
		return (error);

	/* A record length of 1 makes the uio offset count entries directly. */
	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
	    pino, ino, flags)) != 0)
		return (error);

	/*
	 * gfs_readdir_pred() emits "." and ".." itself and yields the virtual
	 * offset 'off' of the next entry: static entries occupy
	 * [0, gfsd_nstatic), dynamic entries follow them.
	 */
	while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
	    cookies)) == 0 && !eof) {

		if (off >= 0 && off < dp->gfsd_nstatic) {
			ino = dp->gfsd_inode(dvp, off);

			if ((error = gfs_readdir_emit(&gstate, uiop,
			    off, ino, dp->gfsd_static[off].gfse_name, 0,
			    ncookies, cookies)) != 0)
				break;

		} else if (dp->gfsd_readdir) {
			/* The callback sees offsets relative to its own entries. */
			off -= dp->gfsd_nstatic;

			if ((error = dp->gfsd_readdir(dvp,
			    gstate.grd_dirent, &eof, &off, &next,
			    data, flags)) != 0 || eof)
				break;

			/*
			 * Translate back to directory offsets (static entries
			 * plus the two dot entries).  Only 'next' is used
			 * below; 'off' is rewritten by the next
			 * gfs_readdir_pred() call.
			 */
			off += dp->gfsd_nstatic + 2;
			next += dp->gfsd_nstatic + 2;

			/* The callback already filled in the scratch entry. */
			if ((error = gfs_readdir_emit_int(&gstate, uiop,
			    next, ncookies, cookies)) != 0)
				break;
		} else {
			/*
			 * Offset is beyond the end of the static entries, and
			 * we have no dynamic entries.  Set EOF.
			 */
			eof = 1;
		}
	}

	/* Frees the scratch entry and maps the -1 "stop" value to success. */
	return (gfs_readdir_fini(&gstate, error, eofp, eof));
}


/*
 * gfs_vop_lookup: VOP_LOOKUP() entry point
 *
 * For use directly in vnode ops table.  Given a GFS directory, calls
 * gfs_dir_lookup() as necessary.  The pnp, rdir and ct arguments are not
 * used.
 */
/* ARGSUSED */
int
gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
}

/*
 * gfs_vop_readdir: VOP_READDIR() entry point
 *
 * For use directly in vnode ops table.  Given a GFS directory, calls
 * gfs_dir_readdir() as necessary.
*/
/* ARGSUSED */
int
gfs_vop_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	uio_t *uiop = ap->a_uio;
	cred_t *cr = ap->a_cred;
	int *eofp = ap->a_eofflag;
	int ncookies = 0;
	u_long *cookies = NULL;
	int error;

	/* Cookies are produced only when the caller asks for them. */
	if (ap->a_ncookies) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
		/* Hand out the array base; 'cookies' is advanced as a cursor. */
		*ap->a_cookies = cookies;
		*ap->a_ncookies = ncookies;
	}

	error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL, cr, 0);

	if (error == 0) {
		/* Subtract unused cookies */
		if (ap->a_ncookies)
			*ap->a_ncookies -= ncookies;
	} else if (ap->a_ncookies) {
		/* On failure the caller gets no cookie array at all. */
		free(*ap->a_cookies, M_TEMP);
		*ap->a_cookies = NULL;
		*ap->a_ncookies = 0;
	}

	return (error);
}


#ifdef illumos
/*
 * gfs_vop_map: VOP_MAP() entry point
 *
 * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
 * This function only works for readonly files, and uses the read function for
 * the vnode to fill in the data.  The mapped data is immediately faulted in and
 * filled with the necessary data during this call; there are no getpage() or
 * putpage() routines.
 */
/* ARGSUSED */
int
gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
    caller_context_t *ct)
{
	int rv;
	ssize_t resid = len;

	/*
	 * Check for bad parameters
	 */
#ifdef _ILP32
	if (len > MAXOFF_T)
		return (ENOMEM);
#endif
	if (vp->v_flag & VNOMAP)
		return (ENOTSUP);
	if (off > MAXOFF_T)
		return (EFBIG);
	if ((long)off < 0 || (long)(off + len) < 0)
		return (EINVAL);
	if (vp->v_type != VREG)
		return (ENODEV);
	/* Executable or writable mappings are refused. */
	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
		return (EACCES);

	/*
	 * Find appropriate address if needed, otherwise clear address range.
	 */
	as_rangelock(as);
	rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (rv != 0) {
		as_rangeunlock(as);
		return (rv);
	}

	/*
	 * Create mapping
	 */
	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
	as_rangeunlock(as);
	if (rv != 0)
		return (rv);

	/*
	 * Fill with data from read()
	 */
	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
	    0, (rlim64_t)0, cred, &resid);

	/* A short read is reported as ENXIO. */
	if (rv == 0 && resid != 0)
		rv = ENXIO;

	/* Tear the mapping down again if it could not be filled. */
	if (rv != 0) {
		as_rangelock(as);
		(void) as_unmap(as, *addrp, len);
		as_rangeunlock(as);
	}

	return (rv);
}
#endif	/* illumos */

/*
 * gfs_vop_reclaim: VOP_RECLAIM() entry point (solaris' VOP_INACTIVE())
 *
 * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
 * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
 *
 * The return value of the *_inactive() routine is not examined; the private
 * data is freed unconditionally using the size recorded in gfs_size.
 */
/* ARGSUSED */
int
gfs_vop_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	gfs_file_t *fp = vp->v_data;

	if (fp->gfs_type == GFS_DIR)
		gfs_dir_inactive(vp);
	else
		gfs_file_inactive(vp);

	vnode_destroy_vobject(vp);
	/* Detach the private data under the vnode interlock before freeing it. */
	VI_LOCK(vp);
	vp->v_data = NULL;
	VI_UNLOCK(vp);
	kmem_free(fp, fp->gfs_size);

	return (0);
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c (revision 299945)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c (revision 299946)
@@ -1,1699 +1,1708 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
*
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * ZFS control directory (a.k.a. ".zfs")
 *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, this is only the 'snapshot' directory, but this may expand in the
 * future.  The elements are built using the GFS primitives, as the hierarchy
 * does not actually exist on disk.
 *
 * For 'snapshot', we don't want to have all snapshots always mounted, because
 * this would take up a huge amount of space in /etc/mnttab.  We have three
 * types of objects:
 *
 *	ctldir ------> snapshotdir -------> snapshot
 *                                             |
 *                                             |
 *                                             V
 *                                         mounted fs
 *
 * The 'snapshot' node contains just enough information to lookup '..' and act
 * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
 * perform an automount of the underlying filesystem and return the
 * corresponding vnode.
 *
 * All mounts are handled automatically by the kernel, but unmounts are
 * (currently) handled from user land.  The main reason is that there is no
 * reliable way to auto-unmount the filesystem when it's "no longer in use".
 * When the user unmounts a filesystem, we call zfsctl_unmount(), which
 * unmounts any snapshots within the snapshot directory.
 *
 * The '.zfs', '.zfs/snapshot', and all directories created under
 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
* * File systems mounted ontop of the GFS nodes '.zfs/snapshot/' * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. * However, vnodes within these mounted on file systems have their v_vfsp * fields set to the head filesystem to make NFS happy (see * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t * so that it cannot be freed until all snapshots have been unmounted. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" typedef struct zfsctl_node { gfs_dir_t zc_gfs_private; uint64_t zc_id; timestruc_t zc_cmtime; /* ctime and mtime, always the same */ } zfsctl_node_t; typedef struct zfsctl_snapdir { zfsctl_node_t sd_node; kmutex_t sd_lock; avl_tree_t sd_snaps; } zfsctl_snapdir_t; typedef struct { char *se_name; vnode_t *se_root; avl_node_t se_node; } zfs_snapentry_t; static int snapentry_compare(const void *a, const void *b) { const zfs_snapentry_t *sa = a; const zfs_snapentry_t *sb = b; int ret = strcmp(sa->se_name, sb->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } #ifdef illumos vnodeops_t *zfsctl_ops_root; vnodeops_t *zfsctl_ops_snapdir; vnodeops_t *zfsctl_ops_snapshot; vnodeops_t *zfsctl_ops_shares; vnodeops_t *zfsctl_ops_shares_dir; static const fs_operation_def_t zfsctl_tops_root[]; static const fs_operation_def_t zfsctl_tops_snapdir[]; static const fs_operation_def_t zfsctl_tops_snapshot[]; static const fs_operation_def_t zfsctl_tops_shares[]; #else static struct vop_vector zfsctl_ops_root; static struct vop_vector zfsctl_ops_snapdir; static struct vop_vector zfsctl_ops_snapshot; static struct vop_vector zfsctl_ops_shares; static struct vop_vector zfsctl_ops_shares_dir; #endif static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); 
#ifdef illumos static gfs_opsvec_t zfsctl_opsvec[] = { { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir }, { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares }, { NULL } }; #endif /* * Root directory elements. We only have two entries * snapshot and shares. */ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, { NULL } }; /* include . and .. in the calculation */ #define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ sizeof (gfs_dirent_t)) + 1) /* * Initialize the various GFS pieces we'll need to create and manipulate .zfs * directories. This is called from the ZFS init routine, and initializes the * vnode ops vectors that we'll be using. */ void zfsctl_init(void) { #ifdef illumos VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0); #endif } void zfsctl_fini(void) { #ifdef illumos /* * Remove vfsctl vnode ops */ if (zfsctl_ops_root) vn_freevnodeops(zfsctl_ops_root); if (zfsctl_ops_snapdir) vn_freevnodeops(zfsctl_ops_snapdir); if (zfsctl_ops_snapshot) vn_freevnodeops(zfsctl_ops_snapshot); if (zfsctl_ops_shares) vn_freevnodeops(zfsctl_ops_shares); if (zfsctl_ops_shares_dir) vn_freevnodeops(zfsctl_ops_shares_dir); zfsctl_ops_root = NULL; zfsctl_ops_snapdir = NULL; zfsctl_ops_snapshot = NULL; zfsctl_ops_shares = NULL; zfsctl_ops_shares_dir = NULL; #endif /* illumos */ } boolean_t zfsctl_is_node(vnode_t *vp) { return (vn_matchops(vp, zfsctl_ops_root) || vn_matchops(vp, zfsctl_ops_snapdir) || vn_matchops(vp, zfsctl_ops_snapshot) || vn_matchops(vp, zfsctl_ops_shares) || vn_matchops(vp, zfsctl_ops_shares_dir)); } /* * Return the inode number associated with the 'snapshot' or * 'shares' directory. 
*/ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; ASSERT(index <= 2); if (index == 0) return (ZFSCTL_INO_SNAPDIR); return (zfsvfs->z_shares_dir); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the vfs_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. */ void zfsctl_create(zfsvfs_t *zfsvfs) { vnode_t *vp, *rvp; zfsctl_node_t *zcp; uint64_t crtime[2]; ASSERT(zfsvfs->z_ctldir == NULL); vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = ZFSCTL_INO_ROOT; VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), &crtime, sizeof (crtime))); ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime); VN_URELE(rvp); /* * We're only faking the fact that we have a root of a filesystem for * the sake of the GFS interfaces. Undo the flag manipulation it did * for us. */ vp->v_vflag &= ~VV_ROOT; zfsvfs->z_ctldir = vp; VOP_UNLOCK(vp, 0); } /* * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. * There might still be more references if we were force unmounted, but only * new zfs_inactive() calls can occur and they don't reference .zfs */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { VN_RELE(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ vnode_t * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); VN_HOLD(zp->z_zfsvfs->z_ctldir); return (zp->z_zfsvfs->z_ctldir); } /* * Common open routine. Disallow any write access. 
*/
/* ARGSUSED */
static int
zfsctl_common_open(struct vop_open_args *ap)
{
	int flags = ap->a_mode;

	/* Opening for write fails with EACCES; everything else succeeds. */
	if (flags & FWRITE)
		return (SET_ERROR(EACCES));

	return (0);
}

/*
 * Common close routine.  Nothing to do here.
 */
/* ARGSUSED */
static int
zfsctl_common_close(struct vop_close_args *ap)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 *
 * Returns EACCES when VWRITE is requested, 0 otherwise.
 */
/* ARGSUSED */
static int
zfsctl_common_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	accmode_t accmode = ap->a_accmode;

	/*
	 * NOTE(review): the TODO-guarded branch refers to `flags', which is
	 * not declared in this function, so it cannot be enabled as written.
	 */
#ifdef TODO
	if (flags & V_ACE_MASK) {
		if (accmode & ACE_ALL_WRITE_PERMS)
			return (SET_ERROR(EACCES));
	} else {
#endif
		if (accmode & VWRITE)
			return (SET_ERROR(EACCES));
#ifdef TODO
	}
#endif

	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 *
 * Sets the attributes shared by the .zfs nodes: owner 0:0, directory type,
 * read/search permission for everyone, no size on disk, atime of "now".
 * Node id, link count, size and the other timestamps are left to the caller.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	timestruc_t	now;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
	    S_IROTH | S_IXOTH;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_flags = 0;
}

/*
 * VOP_FID for the .zfs control nodes: fill a_fid with a short ZFS file id
 * built from the node's zc_id and a generation number of 0.
 */
/*ARGSUSED*/
static int
zfsctl_common_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{
	vnode_t		*vp = ap->a_vp;
	fid_t		*fidp = (void *)ap->a_fid;
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t	*zcp = vp->v_data;
	uint64_t	object = zcp->zc_id;
	zfid_short_t	*zfid;
	int		i;

	ZFS_ENTER(zfsvfs);

#ifdef illumos
	/* Too small a buffer: report the required length and fail. */
	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = SHORT_FID_LEN;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

	/* Store the object id one byte at a time, least significant first. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VOP_FID for '.zfs/shares': forward the request to the vnode of the
 * object recorded in z_shares_dir.  Fails with ENOTSUP when z_shares_dir
 * is 0.
 */
/*ARGSUSED*/
static int
zfsctl_shares_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{
	vnode_t		*vp = ap->a_vp;
	fid_t		*fidp = (void *)ap->a_fid;
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	znode_t		*dzp;
	int		error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}

	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_FID(ZTOV(dzp), fidp);
		/* Release the reference obtained through zfs_zget(). */
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Reclaim routine that only detaches the vnode: the VM object is destroyed
 * and v_data is cleared, but the private data itself is not freed here.
 */
static int
zfsctl_common_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);
	VI_LOCK(vp);
	vp->v_data = NULL;
	VI_UNLOCK(vp);
	return (0);
}

/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 *	ENTRY			ZFSCTL_INODE
 *	.zfs			1
 *	.zfs/snapshot		2
 *	.zfs/snapshot/<snap>	objectid(snap)
 */

#define	ZFSCTL_INO_SNAP(id)	(id)

/*
 * Get root directory attributes.
*/ /* ARGSUSED */ static int zfsctl_root_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_node_t *zcp = vp->v_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; vap->va_birthtime = vap->va_ctime; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); return (0); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; /* * No extended attributes allowed under .zfs */ if (flags & LOOKUP_XATTR) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp); if (err == 0) VOP_UNLOCK(*vpp, 0); } else { err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, cr, ct, direntflags, realpnp); } ZFS_EXIT(zfsvfs); return (err); } #ifdef illumos static int zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { /* * We only care about ACL_ENABLED so that libsec can * display ACL correctly and not default to POSIX draft. 
*/ if (cmd == _PC_ACL_ENABLED) { *valp = _ACL_ACE_ENABLED; return (0); } return (fs_pathconf(vp, cmd, valp, cr, ct)); } #endif /* illumos */ #ifdef illumos static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; #endif /* illumos */ /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_freebsd_root_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; cred_t *cr = ap->a_cnp->cn_cred; int flags = ap->a_cnp->cn_flags; int nameiop = ap->a_cnp->cn_nameiop; char nm[NAME_MAX + 1]; int err; int ltype; if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) return (EOPNOTSUPP); ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (err == 0 && (nm[0] != '.' 
|| nm[1] != '\0')) { ltype = VOP_ISLOCKED(dvp); if (flags & ISDOTDOT) { VN_HOLD(*vpp); VOP_UNLOCK(dvp, 0); } vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); if (flags & ISDOTDOT) { VN_RELE(*vpp); vn_lock(dvp, ltype| LK_RETRY); } } return (err); } static struct vop_vector zfsctl_ops_root = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_root_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_freebsd_root_lookup, .vop_inactive = VOP_NULL, .vop_reclaim = gfs_vop_reclaim, #ifdef TODO .vop_pathconf = zfsctl_pathconf, #endif .vop_fid = zfsctl_common_fid, }; /* * Gets the full dataset name that corresponds to the given snapshot name * Example: * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" */ static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; if (zfs_component_namecheck(name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(zname, "@"); (void) strcat(zname, name); return (0); } static int zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) { vnode_t *svp = sep->se_root; int error; ASSERT(vn_ismntpt(svp)); /* this will be dropped by dounmount() */ if ((error = vn_vfswlock(svp)) != 0) return (error); #ifdef illumos VN_HOLD(svp); error = dounmount(vn_mountedvfs(svp), fflags, cr); if (error) { VN_RELE(svp); return (error); } /* * We can't use VN_RELE(), as that will try to invoke * zfsctl_snapdir_inactive(), which would cause us to destroy * the sd_lock mutex held by our caller. 
*/ ASSERT(svp->v_count == 1); gfs_vop_reclaim(svp, cr, NULL); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); return (0); #else vfs_ref(vn_mountedvfs(svp)); return (dounmount(vn_mountedvfs(svp), fflags, curthread)); #endif } #ifdef illumos static void zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) { avl_index_t where; vfs_t *vfsp; refstr_t *pathref; char newpath[MAXNAMELEN]; char *tail; ASSERT(MUTEX_HELD(&sdp->sd_lock)); ASSERT(sep != NULL); vfsp = vn_mountedvfs(sep->se_root); ASSERT(vfsp != NULL); vfs_lock_wait(vfsp); /* * Change the name in the AVL tree. */ avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); avl_insert(&sdp->sd_snaps, sep, where); /* * Change the current mountpoint info: * - update the tail of the mntpoint path * - update the tail of the resource path */ pathref = vfs_getmntpoint(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '/')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setmntpoint(vfsp, newpath, 0); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '@')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setresource(vfsp, newpath, 0); vfs_unlock(vfsp); } #endif /* illumos */ #ifdef illumos /*ARGSUSED*/ static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; zfsvfs_t *zfsvfs; avl_index_t where; char from[MAXNAMELEN], 
to[MAXNAMELEN]; char real[MAXNAMELEN], fsname[MAXNAMELEN]; int err; zfsvfs = sdvp->v_vfsp->vfs_data; ZFS_ENTER(zfsvfs); if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, MAXNAMELEN, NULL); if (err == 0) { snm = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } } ZFS_EXIT(zfsvfs); dmu_objset_name(zfsvfs->z_os, fsname); err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); if (err == 0) err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); if (err == 0) err = zfs_secpolicy_rename_perms(from, to, cr); if (err != 0) return (err); /* * Cannot move snapshots out of the snapdir. */ if (sdvp != tdvp) return (SET_ERROR(EINVAL)); if (strcmp(snm, tnm) == 0) return (0); mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { mutex_exit(&sdp->sd_lock); return (SET_ERROR(ENOENT)); } err = dsl_dataset_rename_snapshot(fsname, snm, tnm, 0); if (err == 0) zfsctl_rename_snap(sdp, sep, tnm); mutex_exit(&sdp->sd_lock); return (err); } #endif /* illumos */ #ifdef illumos /* ARGSUSED */ static int zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = dvp->v_data; zfs_snapentry_t *sep; zfs_snapentry_t search; zfsvfs_t *zfsvfs; char snapname[MAXNAMELEN]; char real[MAXNAMELEN]; int err; zfsvfs = dvp->v_vfsp->vfs_data; ZFS_ENTER(zfsvfs); if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { err = dmu_snapshot_realname(zfsvfs->z_os, name, real, MAXNAMELEN, NULL); if (err == 0) { name = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } } ZFS_EXIT(zfsvfs); err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); if (err == 0) err = zfs_secpolicy_destroy_perms(snapname, cr); if (err != 0) return (err); mutex_enter(&sdp->sd_lock); search.se_name = name; sep = avl_find(&sdp->sd_snaps, &search, NULL); if (sep) { 
avl_remove(&sdp->sd_snaps, sep); err = zfsctl_unmount_snap(sep, MS_FORCE, cr); if (err != 0) avl_add(&sdp->sd_snaps, sep); else err = dsl_destroy_snapshot(snapname, B_FALSE); } else { err = SET_ERROR(ENOENT); } mutex_exit(&sdp->sd_lock); return (err); } #endif /* illumos */ /* * This creates a snapshot under '.zfs/snapshot'. */ /* ARGSUSED */ static int zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; char name[MAXNAMELEN]; int err; static enum symfollow follow = NO_FOLLOW; static enum uio_seg seg = UIO_SYSSPACE; if (zfs_component_namecheck(dirname, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(zfsvfs->z_os, name); *vpp = NULL; err = zfs_secpolicy_snapshot_perms(name, cr); if (err != 0) return (err); if (err == 0) { err = dmu_objset_snapshot_one(name, dirname); if (err != 0) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); } return (err); } static int zfsctl_freebsd_snapdir_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL, ap->a_vpp, ap->a_cnp->cn_cred, NULL, 0, NULL)); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. 
*/ /* ARGSUSED */ int zfsctl_snapdir_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[MAXNAMELEN]; char real[MAXNAMELEN]; char *mountpoint; zfs_snapentry_t *sep, search; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; int ltype, flags = 0; /* * No extended attributes allowed under .zfs */ if (flags & LOOKUP_XATTR) return (SET_ERROR(EINVAL)); ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); ASSERT(dvp->v_type == VDIR); *vpp = NULL; /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the * spec (which looks like a local path for zfs). We need to * add some flag to domount() to tell it not to do this lookup. */ if (MUTEX_HELD(&sdp->sd_lock)) return (SET_ERROR(ENOENT)); ZFS_ENTER(zfsvfs); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] =='\0') { + VOP_UNLOCK(dvp, 0); + VERIFY0(vn_lock(*vpp, LK_EXCLUSIVE)); + VERIFY0(vn_lock(dvp, LK_EXCLUSIVE)); + } ZFS_EXIT(zfsvfs); return (0); } if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, MAXNAMELEN, &conflict); if (err == 0) { strlcpy(nm, real, sizeof(nm)); } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } #if 0 if (realpnp) (void) strlcpy(realpnp->pn_buf, nm, realpnp->pn_bufsize); if (conflict && direntflags) *direntflags = ED_CASE_CONFLICT; #endif } relookup: mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { *vpp = sep->se_root; VN_HOLD(*vpp); err = traverse(vpp, LK_EXCLUSIVE | LK_RETRY); if (err != 0) { *vpp = NULL; } else if (*vpp == sep->se_root) { /* * The snapshot was unmounted behind our backs, * try to remount it. */ VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0); goto domount; } else { /* * VROOT was set during the traverse call. We need * to clear it since we're pretending to be part * of our parent's vfs. */ (*vpp)->v_flag &= ~VROOT; } mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (err); } /* * The requested snapshot is not currently mounted, look it up. */ err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); if (err != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); /* * handle "ls *" or "?" in a graceful manner, * forcing EILSEQ to ENOENT. * Since shell ultimately passes "*" or "?" as name to lookup */ return (err == EILSEQ ? ENOENT : err); } if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { mutex_exit(&sdp->sd_lock); #ifdef illumos ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); #else /* !illumos */ /* Translate errors and add SAVENAME when needed. 
*/ if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) { err = EJUSTRETURN; cnp->cn_flags |= SAVENAME; } else { err = SET_ERROR(ENOENT); } ZFS_EXIT(zfsvfs); return (err); #endif /* illumos */ } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); avl_insert(&sdp->sd_snaps, sep, where); dmu_objset_rele(snap, FTAG); domount: mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1; mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); (void) snprintf(mountpoint, mountpoint_len, "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", dvp->v_vfsp->mnt_stat.f_mntonname, nm); mutex_exit(&sdp->sd_lock); /* * The vnode may get reclaimed between dropping sd_lock and * getting the vnode lock. * */ err = vn_lock(*vpp, LK_EXCLUSIVE); if (err == ENOENT) goto relookup; VERIFY0(err); err = mount_snapshot(curthread, vpp, "zfs", mountpoint, snapname, 0); kmem_free(mountpoint, mountpoint_len); if (err == 0) { /* * Fix up the root vnode mounted on .zfs/snapshot/. * * This is where we lie about our v_vfsp in order to * make .zfs/snapshot/ accessible over NFS * without requiring manual mounts of . */ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; (*vpp)->v_flag &= ~VROOT; } ZFS_EXIT(zfsvfs); #ifdef illumos /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. 
*/ if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } #else if (err != 0) *vpp = NULL; #endif return (err); } /* ARGSUSED */ int zfsctl_shares_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; char nm[NAME_MAX + 1]; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + if (nm[0] == '.' && nm[1] == '.' && nm[2] =='\0') { + VOP_UNLOCK(dvp, 0); + VERIFY0(vn_lock(*vpp, LK_EXCLUSIVE)); + VERIFY0(vn_lock(dvp, LK_EXCLUSIVE)); + } ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp); VN_RELE(ZTOV(dzp)); } ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[MAXNAMELEN]; uint64_t id, cookie; boolean_t case_conflict; int error; ZFS_ENTER(zfsvfs); cookie = *offp; dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &cookie, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) { ZFS_EXIT(zfsvfs); if (error == ENOENT) { *eofp = 1; return (0); } return (error); } if (flags & V_RDDIR_ENTFLAGS) { edirent_t *eodp = dp; (void) strcpy(eodp->ed_name, snapname); eodp->ed_ino = ZFSCTL_INO_SNAP(id); eodp->ed_eflags = case_conflict ? 
ED_CASE_CONFLICT : 0; } else { struct dirent64 *odp = dp; (void) strcpy(odp->d_name, snapname); odp->d_ino = ZFSCTL_INO_SNAP(id); } *nextp = cookie; ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfsctl_shares_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; cred_t *cr = ap->a_cred; int *eofp = ap->a_eofflag; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY); error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_ncookies, ap->a_cookies); VN_URELE(ZTOV(dzp)); } else { *eofp = 1; error = SET_ERROR(ENOENT); } ZFS_EXIT(zfsvfs); return (error); } /* * pvp is the '.zfs' directory (zfsctl_node_t). * * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). * * This function is the callback to create a GFS vnode for '.zfs/snapshot' * when a lookup is performed on .zfs for "snapshot". 
*/
vnode_t *
zfsctl_mknode_snapdir(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_snapdir_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
	    zfsctl_snapdir_readdir_cb, NULL);
	sdp = vp->v_data;
	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
	/* Timestamps are inherited from the parent '.zfs' node. */
	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
	/* Snapshot entries are kept ordered by name (snapentry_compare()). */
	avl_create(&sdp->sd_snaps, snapentry_compare,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
	/* NOTE(review): presumably gfs_dir_create() returns vp locked. */
	VOP_UNLOCK(vp, 0);
	return (vp);
}

/*
 * Create the GFS vnode for '.zfs/shares' below pvp (the '.zfs' directory)
 * and return it unlocked.  Its timestamps are copied from pvp.
 */
vnode_t *
zfsctl_mknode_shares(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_node_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
	    &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
	    NULL, NULL);
	sdp = vp->v_data;
	sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	VOP_UNLOCK(vp, 0);
	return (vp);

}

/*
 * Getattr for '.zfs/shares': report the attributes of the directory object
 * recorded in z_shares_dir.  Returns ENOTSUP when z_shares_dir is 0.
 */
/* ARGSUSED */
static int
zfsctl_shares_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cr = ap->a_cred;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);
	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		/* The underlying vnode is share-locked around VOP_GETATTR. */
		vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(ZTOV(dzp), vap, cr);
		VN_URELE(ZTOV(dzp));
	}
	ZFS_EXIT(zfsvfs);
	return (error);


}

/*
 * Getattr for '.zfs/snapshot'.  Link count and size are the number of
 * entries in sd_snaps plus two; ctime/mtime come from
 * dmu_objset_snap_cmtime().
 */
/* ARGSUSED */
static int
zfsctl_snapdir_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_snapdir_t *sdp = vp->v_data;

	ZFS_ENTER(zfsvfs);
	zfsctl_common_getattr(vp, vap);
	vap->va_nodeid = gfs_file_inode(vp);
	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
	vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
	vap->va_birthtime = vap->va_ctime;
	ZFS_EXIT(zfsvfs);

	return (0);
}

/*
 * Inactive routine for '.zfs/snapshot': free every entry still in
 * sd_snaps, let GFS deactivate the directory, then destroy the lock and
 * the tree and free the zfsctl_snapdir_t itself.
 */
/* ARGSUSED */
static int
zfsctl_snapdir_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	zfsctl_snapdir_t *sdp = vp->v_data;
	zfs_snapentry_t *sep;

	/*
	 * On forced unmount we have to free snapshots from here.
	 */
	mutex_enter(&sdp->sd_lock);
	while ((sep = avl_first(&sdp->sd_snaps)) != NULL) {
		avl_remove(&sdp->sd_snaps, sep);
		kmem_free(sep->se_name, strlen(sep->se_name) + 1);
		kmem_free(sep, sizeof (zfs_snapentry_t));
	}
	mutex_exit(&sdp->sd_lock);
	gfs_dir_inactive(vp);
	ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
	mutex_destroy(&sdp->sd_lock);
	avl_destroy(&sdp->sd_snaps);
	kmem_free(sdp, sizeof (zfsctl_snapdir_t));

	return (0);
}

#ifdef illumos
/* illumos operation template for '.zfs/snapshot'. */
static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
	{ NULL }
};

/* illumos operation template for '.zfs/shares'. */
static const fs_operation_def_t zfsctl_tops_shares[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_shares_getattr } },
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = zfsctl_shares_readdir } },
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_shares_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive } },
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_shares_fid } },
	{ NULL }
};
#else	/* !illumos */
/* FreeBSD vnode operations for '.zfs/snapshot'. */
static struct vop_vector zfsctl_ops_snapdir = {
	.vop_default =	&default_vnodeops,
	.vop_open =	zfsctl_common_open,
	.vop_close =	zfsctl_common_close,
	.vop_ioctl =	VOP_EINVAL,
	.vop_getattr =	zfsctl_snapdir_getattr,
	.vop_access =	zfsctl_common_access,
	.vop_mkdir =	zfsctl_freebsd_snapdir_mkdir,
	.vop_readdir =	gfs_vop_readdir,
	.vop_lookup =	zfsctl_snapdir_lookup,
	.vop_inactive =	zfsctl_snapdir_inactive,
	.vop_reclaim =	zfsctl_common_reclaim,
	.vop_fid =	zfsctl_common_fid,
};

/* FreeBSD vnode operations for '.zfs/shares'. */
static struct vop_vector zfsctl_ops_shares = {
	.vop_default =	&default_vnodeops,
	.vop_open =	zfsctl_common_open,
	.vop_close =	zfsctl_common_close,
	.vop_ioctl =	VOP_EINVAL,
	.vop_getattr =	zfsctl_shares_getattr,
	.vop_access =	zfsctl_common_access,
	.vop_readdir =	zfsctl_shares_readdir,
	.vop_lookup =	zfsctl_shares_lookup,
	.vop_inactive =	VOP_NULL,
	.vop_reclaim =	gfs_vop_reclaim,
	.vop_fid =	zfsctl_shares_fid,
};
#endif	/* illumos */

/*
 * pvp is the GFS vnode '.zfs/snapshot'.
 *
 * This creates a GFS node under '.zfs/snapshot' representing each
 * snapshot.  This newly created GFS node is what we mount snapshot
 * vfs_t's ontop of.
*/ static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { vnode_t *vp; zfsctl_node_t *zcp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; VOP_UNLOCK(vp, 0); return (vp); } static int zfsctl_snapshot_reclaim(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; cred_t *cr = ap->a_td->td_ucred; struct vop_reclaim_args iap; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int locked; vnode_t *dvp; VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); sdp = dvp->v_data; - VOP_UNLOCK(dvp, 0); /* this may already have been unmounted */ if (sdp == NULL) { VN_RELE(dvp); return (0); } if (!(locked = MUTEX_HELD(&sdp->sd_lock))) mutex_enter(&sdp->sd_lock); ASSERT(!vn_ismntpt(vp)); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); if (sep->se_root == vp) { avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); break; } sep = next; } ASSERT(sep != NULL); if (!locked) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); /* * Dispose of the vnode for the snapshot mount point. * This is safe to do because once this entry has been removed * from the AVL tree, it can't be found again, so cannot become * "active". If we lookup the same name again we will end up * creating a new vnode. 
*/ iap.a_vp = vp; gfs_vop_reclaim(&iap); return (0); } static int zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) { zfsvfs_t *zfsvfs = ap->a_vp->v_vfsp->vfs_data; vnode_t *dvp, *vp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, kcred, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { vp = sep->se_root; if (vp == ap->a_vp) break; sep = AVL_NEXT(&sdp->sd_snaps, sep); } if (sep == NULL) { mutex_exit(&sdp->sd_lock); error = ENOENT; } else { size_t len; len = strlen(sep->se_name); *ap->a_buflen -= len; bcopy(sep->se_name, ap->a_buf + *ap->a_buflen, len); mutex_exit(&sdp->sd_lock); vref(dvp); *ap->a_vpp = dvp; } VN_RELE(dvp); return (error); } /* * These VP's should never see the light of day. They should always * be covered. */ static struct vop_vector zfsctl_ops_snapshot = { .vop_default = &default_vnodeops, .vop_inactive = VOP_NULL, .vop_reclaim = zfsctl_snapshot_reclaim, .vop_vptocnp = zfsctl_snapshot_vptocnp, }; int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *vp; zfsctl_snapdir_t *sdp; zfsctl_node_t *zcp; zfs_snapentry_t *sep; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, kcred, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { vp = sep->se_root; zcp = vp->v_data; if (zcp->zc_id == objsetid) break; sep = AVL_NEXT(&sdp->sd_snaps, sep); } if (sep != NULL) { VN_HOLD(vp); /* * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ * and returns the ZFS vnode mounted on top of the GFS node. 
* This ZFS vnode is the root of the vfs for objset 'objsetid'. */ error = traverse(&vp, LK_SHARED | LK_RETRY); if (error == 0) { if (vp == sep->se_root) { VN_RELE(vp); /* release covered vp */ error = SET_ERROR(EINVAL); } else { *zfsvfsp = VTOZ(vp)->z_zfsvfs; VN_URELE(vp); /* put snapshot's root vp */ } } mutex_exit(&sdp->sd_lock); } else { error = SET_ERROR(EINVAL); mutex_exit(&sdp->sd_lock); } VN_RELE(dvp); return (error); } /* * Unmount any snapshots for the given filesystem. This is called from * zfs_umount() - if we have a ctldir, then go through and unmount all the * snapshots. */ int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); /* * If this snapshot is not mounted, then it must * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ if (vn_ismntpt(sep->se_root)) { error = zfsctl_unmount_snap(sep, fflags, cr); if (error) { avl_index_t where; /* * Before reinserting snapshot to the tree, * check if it was actually removed. For example * when snapshot mount point is busy, we will * have an error here, but there will be no need * to reinsert snapshot. */ if (avl_find(&sdp->sd_snaps, sep, &where) == NULL) avl_insert(&sdp->sd_snaps, sep, where); break; } } sep = next; } mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return (error); }