diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index 3b405e9d68eb..28d98e788263 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -1,1361 +1,1364 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * ZFS control directory (a.k.a. ".zfs")
 *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, this is only the 'snapshot' directory, but this may expand in the
 * future. The elements are built using the GFS primitives, as the hierarchy
 * does not actually exist on disk.
 *
 * For 'snapshot', we don't want to have all snapshots always mounted, because
 * this would take up a huge amount of space in /etc/mnttab. We have three
 * types of objects:
 *
 *	ctldir ------> snapshotdir -------> snapshot
 *                                             |
 *                                             |
 *                                             V
 *                                         mounted fs
 *
 * The 'snapshot' node contains just enough information to lookup '..' and act
 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
 * perform an automount of the underlying filesystem and return the
 * corresponding vnode.
 *
 * All mounts are handled automatically by the kernel, but unmounts are
 * (currently) handled from user land. The main reason is that there is no
 * reliable way to auto-unmount the filesystem when it's "no longer in use".
 * When the user unmounts a filesystem, we call zfsctl_unmount(), which
 * unmounts any snapshots within the snapshot directory.
 *
 * The '.zfs', '.zfs/snapshot', and all directories created under
 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
 *
 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
 * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
 * However, vnodes within these mounted file systems have their v_vfsp
 * fields set to the head filesystem to make NFS happy (see
 * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
 * so that it cannot be freed until all snapshots have been unmounted.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "zfs_namecheck.h"

#include
#include

/* Common access mode for all virtual directories under the ctldir */
const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
    S_IROTH | S_IXOTH;

/*
 * "Synthetic" filesystem implementation.
 */

/*
 * Assert that A implies B.
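 *
 * Usage sketch, taken from the sfs_vgetx() calls later in this file:
 *
 *	KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
 *	    "sfs vnode with no data");
 *
 * This fires only when error == 0 and v_data is NULL; since the macro
 * expands to KASSERT(!(A) || (B)), the assertion is vacuously true
 * whenever A is false.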
*/ #define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); typedef struct sfs_node { char sn_name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t sn_parent_id; uint64_t sn_id; } sfs_node_t; /* * Check the parent's ID as well as the node's to account for a chance * that IDs originating from different domains (snapshot IDs, artificial * IDs, znode IDs) may clash. */ static int sfs_compare_ids(struct vnode *vp, void *arg) { sfs_node_t *n1 = vp->v_data; sfs_node_t *n2 = arg; bool equal; equal = n1->sn_id == n2->sn_id && n1->sn_parent_id == n2->sn_parent_id; /* Zero means equality. */ return (!equal); } static int sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, uint64_t id, struct vnode **vpp) { sfs_node_t search; int err; search.sn_id = id; search.sn_parent_id = parent_id; err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp, sfs_compare_ids, &search); return (err); } static int sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, uint64_t id, struct vnode **vpp) { int err; KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp, sfs_compare_ids, vp->v_data); return (err); } static void sfs_vnode_remove(struct vnode *vp) { vfs_hash_remove(vp); } typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); static int sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, const char *tag, struct vop_vector *vops, sfs_vnode_setup_fn setup, void *arg, struct vnode **vpp) { struct vnode *vp; int error; error = sfs_vnode_get(mp, flags, parent_id, id, vpp); if (error != 0 || *vpp != NULL) { KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, "sfs vnode with no data"); return (error); } /* Allocate a new vnode/inode. */ error = getnewvnode(tag, mp, vops, &vp); if (error != 0) { *vpp = NULL; return (error); } /* * Exclusively lock the vnode vnode while it's being constructed. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { *vpp = NULL; return (error); } setup(vp, arg); error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); if (error != 0 || *vpp != NULL) { KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, "sfs vnode with no data"); return (error); } *vpp = vp; return (0); } static void sfs_print_node(sfs_node_t *node) { printf("\tname = %s\n", node->sn_name); printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); printf("\tid = %ju\n", (uintmax_t)node->sn_id); } static sfs_node_t * sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) { struct sfs_node *node; KASSERT(strlen(name) < sizeof (node->sn_name), ("sfs node name is too long")); KASSERT(size >= sizeof (*node), ("sfs node size is too small")); node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); strlcpy(node->sn_name, name, sizeof (node->sn_name)); node->sn_parent_id = parent_id; node->sn_id = id; return (node); } static void sfs_destroy_node(sfs_node_t *node) { free(node, M_SFSNODES); } static void * sfs_reclaim_vnode(vnode_t *vp) { void *data; sfs_vnode_remove(vp); data = vp->v_data; vp->v_data = NULL; return (data); } static int sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, zfs_uio_t *uio, off_t *offp) { struct dirent entry; int error; /* Reset ncookies for subsequent use of vfs_read_dirent. 
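 *
 * The synthetic layout this helper produces, with offsets in units of
 * sizeof (entry):
 *
 *	offset 0                   ->  "."   (d_fileno = id)
 *	offset sizeof (entry)      ->  ".."  (d_fileno = parent_id)
 *	offset 2 * sizeof (entry)  ->  first caller entry (returned in *offp)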
*/ if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; if (zfs_uio_resid(uio) < sizeof (entry)) return (SET_ERROR(EINVAL)); if (zfs_uio_offset(uio) < 0) return (SET_ERROR(EINVAL)); if (zfs_uio_offset(uio) == 0) { entry.d_fileno = id; entry.d_type = DT_DIR; entry.d_name[0] = '.'; entry.d_name[1] = '\0'; entry.d_namlen = 1; entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio)); if (error != 0) return (SET_ERROR(error)); } if (zfs_uio_offset(uio) < sizeof (entry)) return (SET_ERROR(EINVAL)); if (zfs_uio_offset(uio) == sizeof (entry)) { entry.d_fileno = parent_id; entry.d_type = DT_DIR; entry.d_name[0] = '.'; entry.d_name[1] = '.'; entry.d_name[2] = '\0'; entry.d_namlen = 2; entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio)); if (error != 0) return (SET_ERROR(error)); } if (offp != NULL) *offp = 2 * sizeof (entry); return (0); } /* * .zfs inode namespace * * We need to generate unique inode numbers for all files and directories * within the .zfs pseudo-filesystem. We use the following scheme: * * ENTRY ZFSCTL_INODE * .zfs 1 * .zfs/snapshot 2 * .zfs/snapshot/ objectid(snap) */ #define ZFSCTL_INO_SNAP(id) (id) static struct vop_vector zfsctl_ops_root; static struct vop_vector zfsctl_ops_snapdir; static struct vop_vector zfsctl_ops_snapshot; void zfsctl_init(void) { } void zfsctl_fini(void) { } boolean_t zfsctl_is_node(vnode_t *vp) { return (vn_matchops(vp, zfsctl_ops_root) || vn_matchops(vp, zfsctl_ops_snapdir) || vn_matchops(vp, zfsctl_ops_snapshot)); } typedef struct zfsctl_root { sfs_node_t node; sfs_node_t *snapdir; timestruc_t cmtime; } zfsctl_root_t; /* * Create the '.zfs' directory. */ void zfsctl_create(zfsvfs_t *zfsvfs) { zfsctl_root_t *dot_zfs; sfs_node_t *snapdir; vnode_t *rvp; uint64_t crtime[2]; ASSERT3P(zfsvfs->z_ctldir, ==, NULL); snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR); dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0, ZFSCTL_INO_ROOT); dot_zfs->snapdir = snapdir; VERIFY0(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp)); VERIFY0(sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), &crtime, sizeof (crtime))); ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); vput(rvp); zfsvfs->z_ctldir = dot_zfs; } /* * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. * The nodes must not have any associated vnodes by now as they should be * vflush-ed. */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { sfs_destroy_node(zfsvfs->z_ctldir->snapdir); sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } static int zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, struct vnode **vpp) { return (VFS_ROOT(mp, flags, vpp)); } static void zfsctl_common_vnode_setup(vnode_t *vp, void *arg) { ASSERT_VOP_ELOCKED(vp, __func__); /* We support shared locking. 
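 *
 * This runs as an sfs_vgetx() setup callback while the new vnode is
 * still exclusively locked (see the ASSERT above); e.g.
 * zfsctl_root_vnode() below passes it together with the ctldir node:
 *
 *	sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs",
 *	    &zfsctl_ops_root, zfsctl_common_vnode_setup, node, vpp);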
*/ VN_LOCK_ASHARE(vp); vp->v_type = VDIR; vp->v_data = arg; } static int zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, struct vnode **vpp) { void *node; int err; node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir; err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, zfsctl_common_vnode_setup, node, vpp); return (err); } static int zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, struct vnode **vpp) { void *node; int err; node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir; err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); return (err); } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ int zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) { int error; error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); return (error); } /* * Common open routine. Disallow any write access. */ static int zfsctl_common_open(struct vop_open_args *ap) { int flags = ap->a_mode; if (flags & FWRITE) return (SET_ERROR(EACCES)); return (0); } /* * Common close routine. Nothing to do here. */ /* ARGSUSED */ static int zfsctl_common_close(struct vop_close_args *ap) { return (0); } /* * Common access routine. Disallow writes. */ static int zfsctl_common_access(struct vop_access_args *ap) { accmode_t accmode = ap->a_accmode; if (accmode & VWRITE) return (SET_ERROR(EACCES)); return (0); } /* * Common getattr function. Fill in basic information. */ static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { timestruc_t now; sfs_node_t *node; node = vp->v_data; vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = 0; /* * We are a purely virtual object, so we have no * blocksize or allocated blocks. */ vap->va_blksize = 0; vap->va_nblocks = 0; vap->va_seq = 0; vn_fsid(vp, vap); vap->va_mode = zfsctl_ctldir_mode; vap->va_type = VDIR; /* * We live in the now (for atime). */ gethrestime(&now); vap->va_atime = now; /* FreeBSD: Reset chflags(2) flags. */ vap->va_flags = 0; vap->va_nodeid = node->sn_id; /* At least '.' and '..'. */ vap->va_nlink = 2; } #ifndef _OPENSOLARIS_SYS_VNODE_H_ struct vop_fid_args { struct vnode *a_vp; struct fid *a_fid; }; #endif static int zfsctl_common_fid(struct vop_fid_args *ap) { vnode_t *vp = ap->a_vp; fid_t *fidp = (void *)ap->a_fid; sfs_node_t *node = vp->v_data; uint64_t object = node->sn_id; zfid_short_t *zfid; int i; zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs nodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfsctl_common_reclaim(struct vop_reclaim_args *ap) { vnode_t *vp = ap->a_vp; (void) sfs_reclaim_vnode(vp); return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_print_args { struct vnode *a_vp; }; #endif static int zfsctl_common_print(struct vop_print_args *ap) { sfs_print_node(ap->a_vp->v_data); return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get root directory attributes. 
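 *
 * As a rough userland illustration (the /pool mountpoint is
 * hypothetical), the values filled in below surface as:
 *
 *	$ stat -f '%Sp %l %N' /pool/.zfs
 *	dr-xr-xr-x 3 /pool/.zfs		// 0555; nlink = 2 + snapdir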
*/ static int zfsctl_root_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsctl_root_t *node = vp->v_data; zfsctl_common_getattr(vp, vap); vap->va_ctime = node->cmtime; vap->va_mtime = vap->va_ctime; vap->va_birthtime = vap->va_ctime; vap->va_nlink += 1; /* snapdir */ vap->va_size = vap->va_nlink; return (0); } /* * When we lookup "." we still can be asked to lock it * differently, can't we? */ static int zfsctl_relock_dot(vnode_t *dvp, int ltype) { vref(dvp); if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* Relock for the "." case may left us with reclaimed vnode. */ if (VN_IS_DOOMED(dvp)) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } /* * Special case the handling of "..". */ static int zfsctl_root_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; int flags = ap->a_cnp->cn_flags; int lkflags = ap->a_cnp->cn_lkflags; int nameiop = ap->a_cnp->cn_nameiop; int err; ASSERT3S(dvp->v_type, ==, VDIR); if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); if (err == 0) *vpp = dvp; } else if ((flags & ISDOTDOT) != 0) { err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, lkflags, vpp); } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); } else { err = SET_ERROR(ENOENT); } if (err != 0) *vpp = NULL; return (err); } static int zfsctl_root_readdir(struct vop_readdir_args *ap) { struct dirent entry; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_root_t *node = vp->v_data; zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; int error; zfs_uio_init(&uio, ap->a_uio); ASSERT3S(vp->v_type, ==, VDIR); error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio, &dots_offset); if (error != 0) { if (error == ENAMETOOLONG) /* ran out of destination space */ error = 0; return (error); } if (zfs_uio_offset(&uio) != dots_offset) return (SET_ERROR(EINVAL)); CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name)); entry.d_fileno = node->snapdir->sn_id; entry.d_type = DT_DIR; strcpy(entry.d_name, node->snapdir->sn_name); entry.d_namlen = strlen(entry.d_name); entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { if (error == ENAMETOOLONG) error = 0; return (SET_ERROR(error)); } if (eofp != NULL) *eofp = 1; return (0); } static int zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) { static const char dotzfs_name[4] = ".zfs"; vnode_t *dvp; int error; if (*ap->a_buflen < sizeof (dotzfs_name)) return (SET_ERROR(ENOMEM)); error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, LK_SHARED, &dvp); if (error != 0) return (SET_ERROR(error)); VOP_UNLOCK1(dvp); *ap->a_vpp = dvp; *ap->a_buflen -= sizeof (dotzfs_name); bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); return (0); } static int zfsctl_common_pathconf(struct vop_pathconf_args *ap) { /* * We care about ACL variables so that user land utilities like ls * can display them correctly. Since the ctldir's st_dev is set to be * the same as the parent dataset, we must support all variables that * it supports. 
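 *
 * A sketch of the resulting userland contract (the path is
 * hypothetical):
 *
 *	pathconf("/pool/.zfs", _PC_ACL_NFS4);	// -> 1 (NFSv4 ACLs)
 *	pathconf("/pool/.zfs", _PC_LINK_MAX);	// -> MIN(LONG_MAX, ZFS_LINK_MAX)
 *
 * Anything not handled below falls through to vop_stdpathconf().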
*/ switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_MIN_HOLE_SIZE: *ap->a_retval = (int)SPA_MINBLOCKSIZE; return (0); case _PC_ACL_EXTENDED: *ap->a_retval = 0; return (0); case _PC_ACL_NFS4: *ap->a_retval = 1; return (0); case _PC_ACL_PATH_MAX: *ap->a_retval = ACL_MAX_ENTRIES; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); default: return (vop_stdpathconf(ap)); } } /* * Returns a trivial ACL */ static int zfsctl_common_getacl(struct vop_getacl_args *ap) { int i; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); /* * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify * attributes. That is not the case for the ctldir, so we must clear * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs * aren't supported by the ctldir. */ for (i = 0; i < ap->a_aclp->acl_cnt; i++) { struct acl_entry *entry; entry = &(ap->a_aclp->acl_entry[i]); entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | ACL_READ_NAMED_ATTRS); } return (0); } static struct vop_vector zfsctl_ops_root = { .vop_default = &default_vnodeops, #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, #endif .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_root_getattr, .vop_access = zfsctl_common_access, .vop_readdir = zfsctl_root_readdir, .vop_lookup = zfsctl_root_lookup, .vop_inactive = VOP_NULL, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, .vop_print = zfsctl_common_print, .vop_vptocnp = zfsctl_root_vptocnp, .vop_pathconf = zfsctl_common_pathconf, .vop_getacl = zfsctl_common_getacl, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(zname, "@"); (void) strcat(zname, name); return (0); } static int zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; int err; err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); return (err); } /* * Given a vnode get a root vnode of a filesystem mounted on top of * the vnode, if any. The root vnode is referenced and locked. * If no filesystem is mounted then the orinal vnode remains referenced * and locked. If any error happens the orinal vnode is unlocked and * released. 
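 *
 * The expected caller pattern (as in zfsctl_snapdir_lookup() below):
 *
 *	err = zfsctl_mounted_here(vpp, lkflags);
 *	if (err != EJUSTRETURN)
 *		return (err);	// *vpp is the covering root, or gone
 *	// EJUSTRETURN: nothing mounted, *vpp still locked + referenced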
*/ static int zfsctl_mounted_here(vnode_t **vpp, int flags) { struct mount *mp; int err; ASSERT_VOP_LOCKED(*vpp, __func__); ASSERT3S((*vpp)->v_type, ==, VDIR); if ((mp = (*vpp)->v_mountedhere) != NULL) { err = vfs_busy(mp, 0); KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); vput(*vpp); err = VFS_ROOT(mp, flags, vpp); vfs_unbusy(mp); return (err); } return (EJUSTRETURN); } typedef struct { const char *snap_name; uint64_t snap_id; } snapshot_setup_arg_t; static void zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) { snapshot_setup_arg_t *ssa = arg; sfs_node_t *node; ASSERT_VOP_ELOCKED(vp, __func__); node = sfs_alloc_node(sizeof (sfs_node_t), ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); zfsctl_common_vnode_setup(vp, node); /* We have to support recursive locking. */ VN_LOCK_AREC(vp); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. * There are four possibilities: * - the snapshot node and vnode do not exist * - the snapshot vnode is covered by the mounted snapshot * - the snapshot vnode is not covered yet, the mount operation is in progress * - the snapshot vnode is not covered, because the snapshot has been unmounted * The last two states are transient and should be relatively short-lived. */ static int zfsctl_snapdir_lookup(struct vop_lookup_args *ap) { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; char name[NAME_MAX + 1]; char fullname[ZFS_MAX_DATASET_NAME_LEN]; char *mountpoint; size_t mountpoint_len; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; uint64_t snap_id; int nameiop = cnp->cn_nameiop; int lkflags = cnp->cn_lkflags; int flags = cnp->cn_flags; int err; ASSERT3S(dvp->v_type, ==, VDIR); if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); if (err == 0) *vpp = dvp; return (err); } if (flags & ISDOTDOT) { err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, vpp); return (err); } if (cnp->cn_namelen >= sizeof (name)) return (SET_ERROR(ENAMETOOLONG)); strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); err = zfsctl_snapshot_lookup(dvp, name, &snap_id); if (err != 0) return (SET_ERROR(ENOENT)); for (;;) { snapshot_setup_arg_t ssa; ssa.snap_name = name; ssa.snap_id = snap_id; err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, snap_id, "zfs", &zfsctl_ops_snapshot, zfsctl_snapshot_vnode_setup, &ssa, vpp); if (err != 0) return (err); /* Check if a new vnode has just been created. */ if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) break; /* * Check if a snapshot is already mounted on top of the vnode. */ err = zfsctl_mounted_here(vpp, lkflags); if (err != EJUSTRETURN) return (err); /* * If the vnode is not covered, then either the mount operation * is in progress or the snapshot has already been unmounted * but the vnode hasn't been inactivated and reclaimed yet. * We can try to re-use the vnode in the latter case. 
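 *
 * To summarize the states this loop can observe and their handling:
 *
 *	freshly created vnode (exclusively locked)  -> proceed to mount
 *	covered (v_mountedhere != NULL)             -> return covering root
 *	VI_MOUNT set (mount in progress)            -> vput(), yield, retry
 *	uncovered, VI_MOUNT clear (unmounted)       -> LK_TRYUPGRADE, reuse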
*/ VI_LOCK(*vpp); if (((*vpp)->v_iflag & VI_MOUNT) == 0) { /* * Upgrade to exclusive lock in order to: * - avoid race conditions * - satisfy the contract of mount_snapshot() */ err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); if (err == 0) break; } else { VI_UNLOCK(*vpp); } /* * In this state we can loop on uncontested locks and starve * the thread doing the lengthy, non-trivial mount operation. * So, yield to prevent that from happening. */ vput(*vpp); kern_yield(PRI_USER); } VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname)); mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); (void) snprintf(mountpoint, mountpoint_len, "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", dvp->v_vfsp->mnt_stat.f_mntonname, name); err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); kmem_free(mountpoint, mountpoint_len); if (err == 0) { /* * Fix up the root vnode mounted on .zfs/snapshot/. * * This is where we lie about our v_vfsp in order to * make .zfs/snapshot/ accessible over NFS * without requiring manual mounts of . */ ASSERT3P(VTOZ(*vpp)->z_zfsvfs, !=, zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; /* Clear the root flag (set via VFS_ROOT) as well. */ (*vpp)->v_vflag &= ~VV_ROOT; } if (err != 0) *vpp = NULL; return (err); } static int zfsctl_snapdir_readdir(struct vop_readdir_args *ap) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; struct dirent entry; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; int error; zfs_uio_init(&uio, ap->a_uio); ASSERT3S(vp->v_type, ==, VDIR); error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, &uio, &dots_offset); if (error != 0) { if (error == ENAMETOOLONG) /* ran out of destination space */ error = 0; return (error); } ZFS_ENTER(zfsvfs); for (;;) { uint64_t cookie; uint64_t id; cookie = zfs_uio_offset(&uio) - dots_offset; dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), snapname, &id, &cookie, NULL); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error != 0) { if (error == ENOENT) { if (eofp != NULL) *eofp = 1; error = 0; } ZFS_EXIT(zfsvfs); return (error); } entry.d_fileno = id; entry.d_type = DT_DIR; strcpy(entry.d_name, snapname); entry.d_namlen = strlen(entry.d_name); entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { if (error == ENAMETOOLONG) error = 0; ZFS_EXIT(zfsvfs); return (SET_ERROR(error)); } zfs_uio_setoffset(&uio, cookie + dots_offset); } __builtin_unreachable(); } static int zfsctl_snapdir_getattr(struct vop_getattr_args *ap) { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; dsl_dataset_t *ds; uint64_t snap_count; int err; ZFS_ENTER(zfsvfs); ds = dmu_objset_ds(zfsvfs->z_os); zfsctl_common_getattr(vp, vap); vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); vap->va_mtime = vap->va_ctime; vap->va_birthtime = vap->va_ctime; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { ZFS_EXIT(zfsvfs); return (err); } vap->va_nlink += snap_count; } vap->va_size = vap->va_nlink; ZFS_EXIT(zfsvfs); return (0); } static struct vop_vector zfsctl_ops_snapdir = { .vop_default = &default_vnodeops, #if 
__FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, #endif .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, .vop_getattr = zfsctl_snapdir_getattr, .vop_access = zfsctl_common_access, .vop_readdir = zfsctl_snapdir_readdir, .vop_lookup = zfsctl_snapdir_lookup, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, .vop_print = zfsctl_common_print, .vop_pathconf = zfsctl_common_pathconf, .vop_getacl = zfsctl_common_getacl, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); static int zfsctl_snapshot_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; VERIFY3S(vrecycle(vp), ==, 1); return (0); } static int zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap) { vnode_t *vp = ap->a_vp; void *data = vp->v_data; sfs_reclaim_vnode(vp); sfs_destroy_node(data); return (0); } static int zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) { struct mount *mp; vnode_t *dvp; vnode_t *vp; sfs_node_t *node; size_t len; int locked; int error; vp = ap->a_vp; node = vp->v_data; len = strlen(node->sn_name); if (*ap->a_buflen < len) return (SET_ERROR(ENOMEM)); /* * Prevent unmounting of the snapshot while the vnode lock * is not held. That is not strictly required, but allows * us to assert that an uncovered snapshot vnode is never * "leaked". */ mp = vp->v_mountedhere; if (mp == NULL) return (SET_ERROR(ENOENT)); error = vfs_busy(mp, 0); KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); /* * We can vput the vnode as we can now depend on the reference owned * by the busied mp. But we also need to hold the vnode, because * the reference may go after vfs_unbusy() which has to be called * before we can lock the vnode again. */ locked = VOP_ISLOCKED(vp); #if __FreeBSD_version >= 1300045 enum vgetstate vs = vget_prep(vp); #else vhold(vp); #endif vput(vp); /* Look up .zfs/snapshot, our parent. */ error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); if (error == 0) { VOP_UNLOCK1(dvp); *ap->a_vpp = dvp; *ap->a_buflen -= len; bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); } vfs_unbusy(mp); #if __FreeBSD_version >= 1300045 vget_finish(vp, locked | LK_RETRY, vs); #else vget(vp, locked | LK_VNHELD | LK_RETRY, curthread); #endif return (error); } /* * These VP's should never see the light of day. They should always * be covered. */ static struct vop_vector zfsctl_ops_snapshot = { .vop_default = NULL, /* ensure very restricted access */ #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, #endif .vop_inactive = zfsctl_snapshot_inactive, #if __FreeBSD_version >= 1300045 .vop_need_inactive = vop_stdneed_inactive, #endif .vop_reclaim = zfsctl_snapshot_reclaim, .vop_vptocnp = zfsctl_snapshot_vptocnp, .vop_lock1 = vop_stdlock, .vop_unlock = vop_stdunlock, .vop_islocked = vop_stdislocked, .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ .vop_print = zfsctl_common_print, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot); int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { zfsvfs_t *zfsvfs __unused = vfsp->vfs_data; vnode_t *vp; int error; ASSERT3P(zfsvfs->z_ctldir, !=, NULL); *zfsvfsp = NULL; error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, ZFSCTL_INO_SNAPDIR, objsetid, &vp); if (error == 0 && vp != NULL) { /* * XXX Probably need to at least reference, if not busy, the mp. 
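 * (Without that, a concurrent unmount could in principle free
 * mp->mnt_data between the v_mountedhere check below and the caller's
 * use of *zfsvfsp; sfs_vnode_get() only pins the snapshot vnode, not
 * the mount stacked on top of it.)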
*/ if (vp->v_mountedhere != NULL) *zfsvfsp = vp->v_mountedhere->mnt_data; vput(vp); } if (*zfsvfsp == NULL) return (SET_ERROR(EINVAL)); return (0); } /* * Unmount any snapshots for the given filesystem. This is called from * zfs_umount() - if we have a ctldir, then go through and unmount all the * snapshots. */ int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; zfsvfs_t *zfsvfs = vfsp->vfs_data; struct mount *mp; vnode_t *vp; uint64_t cookie; int error; ASSERT3P(zfsvfs->z_ctldir, !=, NULL); cookie = 0; for (;;) { uint64_t id; dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), snapname, &id, &cookie, NULL); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error != 0) { if (error == ENOENT) error = 0; break; } for (;;) { error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, ZFSCTL_INO_SNAPDIR, id, &vp); if (error != 0 || vp == NULL) break; mp = vp->v_mountedhere; /* * v_mountedhere being NULL means that the * (uncovered) vnode is in a transient state * (mounting or unmounting), so loop until it * settles down. */ if (mp != NULL) break; vput(vp); } if (error != 0) break; if (vp == NULL) continue; /* no mountpoint, nothing to do */ /* * The mount-point vnode is kept locked to avoid spurious EBUSY * from a concurrent umount. * The vnode lock must have recursive locking enabled. */ vfs_ref(mp); error = dounmount(mp, fflags, curthread); KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, ("extra references after unmount")); vput(vp); if (error != 0) break; } KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, ("force unmounting failed")); return (error); } int zfsctl_snapshot_unmount(const char *snapname, int flags __unused) { vfs_t *vfsp = NULL; zfsvfs_t *zfsvfs = NULL; if (strchr(snapname, '@') == NULL) return (0); int err = getzfsvfs(snapname, &zfsvfs); if (err != 0) { ASSERT3P(zfsvfs, ==, NULL); return (0); } vfsp = zfsvfs->z_vfs; ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); vfs_ref(vfsp); vfs_unbusy(vfsp); return (dounmount(vfsp, MS_FORCE, curthread)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 11bbda5452a1..2f7019b92498 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1,6219 +1,6222 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. 
*/ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 1300102 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef VN_OPEN_INVFS #define VN_OPEN_INVFS 0x0 #endif VFS_SMR_DECLARE; #if __FreeBSD_version >= 1300047 #define vm_page_wire_lock(pp) #define vm_page_wire_unlock(pp) #else #define vm_page_wire_lock(pp) vm_page_lock(pp) #define vm_page_wire_unlock(pp) vm_page_unlock(pp) #endif #ifdef DEBUG_VFS_LOCKS #define VNCHECKREF(vp) \ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ ("%s: wrong ref counts", __func__)); #else #define VNCHECKREF(vp) #endif /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. 
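 * Thread B, in turn, cannot commit (and so cannot unblock Thread A)
 * until it acquires the lock that Thread A holds: a classic
 * two-resource deadlock.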
* * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, int *rvalp) { loff_t off; int error; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. 
*/ } case _FIOGDIO: case _FIOSDIO: { return (0); } case F_SEEK_DATA: case F_SEEK_HOLE: { off = *(offset_t *)data; /* offset parameter is in/out */ error = zfs_holey(VTOZ(vp), com, &off); if (error) return (error); *(offset_t *)data = off; return (0); } } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considered clean despite have some * dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked_12(obj); #if __FreeBSD_version < 1300050 for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } #else vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } #endif return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); #if __FreeBSD_version >= 1300041 vm_object_pip_wakeup(pp->object); #else vm_object_pip_subtract(pp->object, 1); #endif } #if __FreeBSD_version > 1300051 static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t m; obj = vp->v_object; vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); return (m); } #else static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_wire_lock(pp); vm_page_hold(pp); vm_page_wire_unlock(pp); } else pp = NULL; break; } return (pp); } #endif static void page_unhold(vm_page_t pp) { vm_page_wire_lock(pp); #if __FreeBSD_version >= 1300035 vm_page_unwire(pp, PQ_ACTIVE); #else vm_page_unhold(pp); #endif vm_page_wire_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. 
What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { vm_object_t obj; struct sf_buf *sf; vnode_t *vp = ZTOV(zp); caddr_t va; int off; ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); #if __FreeBSD_version >= 1300041 vm_object_pip_add(obj, 1); #endif for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, zp->z_id, start + off, nbytes, va + off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unbusy(pp); } len -= nbytes; off = 0; } #if __FreeBSD_version >= 1300041 vm_object_pip_wakeup(obj); #else vm_object_pip_wakeupn(obj, 0); #endif zfs_vmobject_wunlock_12(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ int mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int error = 0; ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY); ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET); zfs_vmobject_wlock_12(obj); for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (vm_page_none_valid(pp)) { zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); #if __FreeBSD_version >= 1300081 if (error == 0) { vm_page_valid(pp); vm_page_activate(pp); vm_page_do_sunbusy(pp); } else { zfs_vmobject_wlock(obj); if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); else vm_page_sunbusy(pp); zfs_vmobject_wunlock(obj); } #else vm_page_do_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); #endif } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(pp); } if (error) break; zfs_uio_advance(uio, bytes); len -= bytes; } zfs_vmobject_wunlock_12(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. 
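 *
 * In outline, each pass of the per-page loop below does:
 *
 *	if ((pp = page_hold(vp, start)))	// resident: copy from page
 *		vn_io_fault_uiomove(va + off, bytes, GET_UIO_STRUCT(uio));
 *	else					// not resident: DMU read
 *		dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes);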
*/ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); vm_object_t obj; int64_t start; int len = nbytes; int off; int error = 0; ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); start = zfs_uio_offset(uio); off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if ((pp = page_hold(vp, start))) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); error = vn_io_fault_uiomove(va + off, bytes, GET_UIO_STRUCT(uio)); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unhold(pp); } else { zfs_vmobject_wunlock_12(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock_12(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock_12(obj); return (error); } int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *presid) { int error = 0; ssize_t resid; error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); if (error) { return (SET_ERROR(error)); } else if (presid == NULL) { if (resid != 0) { error = SET_ERROR(EIO); } } else { *presid = resid; } return (error); } void zfs_zrele_async(znode_t *zp) { vnode_t *vp = ZTOV(zp); objset_t *os = ITOZSB(vp)->z_os; VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os))); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs; int error; int ltype; if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(dvp, __func__); if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (VN_IS_DOOMED(dvp)) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. 
* A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, int flags, boolean_t cached) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; #if __FreeBSD_version > 1300124 seqc_t dvp_seqc; #endif int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, const char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); #if __FreeBSD_version > 1300124 dvp_seqc = vn_seqc_read_notmodify(dvp); #endif *vpp = NULL; if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } *vpp = ZTOV(zp); /* * Do we have permission to get into attribute directory? */ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr); if (error) { vrele(ZTOV(zp)); } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory if we're not coming in via * VOP_CACHEDLOOKUP. */ if (!cached) { #ifdef NOEXECCHECK if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; } else #endif if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. 
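 * (A snapshot-mounted filesystem is identified by z_parent != zfsvfs:
 * zfsctl_snapdir_lookup() points z_parent at the head filesystem when
 * it mounts the snapshot, so ".." from a snapshot root must resolve
 * through the parent's .zfs/snapshot directory, looked up below.)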
*/ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK1(dvp); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } fallthrough; case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } #if __FreeBSD_version > 1300124 if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * FIXME: zfs_lookup_lock relocks vnodes and does nothing to * handle races. In particular different callers may end up * with different vnodes and will try to add conflicting * entries to the namecache. * * While finding different result may be acceptable in face * of concurrent modification, adding conflicting entries * trips over an assert in the namecache. * * Ultimately let an entry through once everything settles. */ if (!vn_seqc_consistent(dvp, dvp_seqc)) { cnp->cn_flags &= ~MAKEENTRY; } } #endif /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. 
* ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); uint64_t projid = ZFS_DEFAULT_PROJID; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype; #ifdef DEBUG_VFS_LOCKS vnode_t *dvp = ZTOV(dzp); #endif /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *zpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve_(); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: VNCHECKREF(dvp); if (error == 0) { *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. 
* cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zp = VTOZ(vp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; xattr_obj = 0; xzp = NULL; if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. 
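* zfs_link_destroy() reports through "unlinked" whether this was the last link. If it was, the znode is not freed here; it is parked on the unlinked set below (zfs_unlinked_add()) and reclaimed only once the last hold on the vnode is dropped, and VV_NOSYNC is set so no further effort is wasted syncing data of the removed file (a reading of the code below, not authoritative).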
*/ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } /* XXX check changes to linux vnops */ txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, struct componentname *cnp, int nameiop) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; cnp->cn_flags = ISLASTCN | SAVENAME; cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 cnp->cn_thread = curthread; #endif if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { struct vop_lookup_args a; a.a_gen.a_desc = &vop_lookup_desc; a.a_dvp = ZTOV(dzp); a.a_vpp = vpp; a.a_cnp = cnp; error = vfs_cache_lookup(&a); } else { error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0, B_FALSE); } #ifdef ZFS_DEBUG if (error) { printf("got error %d on name %s on op %d\n", error, name, nameiop); kdb_backtrace(); } #endif return (error); } int zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) { vnode_t *vp; int error; struct componentname cn; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_remove_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT3U(vap->va_type, ==, VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. 
* * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *zpp = NULL; if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } #if __FreeBSD_version < 1300124 static void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { cache_purge(dvp); cache_purge(vp); } #endif /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); cache_vop_rmdir(dvp, vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) { struct componentname cn; vnode_t *vp; int error; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap are always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, int *ncookies, ulong_t **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; ulong_t *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = zfs_uio_offset(uio); prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor.
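* A sketch of the offset encoding documented above: offsets 0, 1 and 2 name the synthetic entries '.', '..' and (on the root directory) '.zfs', so any offset <= 3 restarts iteration from the beginning, while any larger offset is a position previously produced by zap_cursor_serialize() and is handed back verbatim to zap_cursor_init_serialized() below.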
*/ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = GET_UIO_STRUCT(uio)->uio_iov; bytes_wanted = iovp->iov_len; if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) - sizeof (((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? 
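* If not even a single entry fit (outcount == 0), the caller's buffer cannot hold the next record at all, so we fail with EINVAL below rather than spin; if at least one entry was emitted we simply stop and return the partial batch, letting the caller come back with the updated offset.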
*/ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT3S(outcount, <=, bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; zfs_uio_resid(uio) -= outcount; } else if ((error = zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) { /* * Reset the pointer. */ offset = zfs_uio_offset(uio); } update: zap_cursor_fini(&zc); if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_uio_setoffset(uio, offset); ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 
* Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr))) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; vn_fsid(vp, vap); vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { xoap->xoa_projinherit = ((zp->z_pflags & ZFS_PROJINHERIT) != 0); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { xoap->xoa_projid = zp->z_projid; XVA_SET_RTN(xvap, XAT_PROJID); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, 
ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * On immutable files only the immutable bit and atime can be altered. */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify that the timestamps don't overflow 32 bits. * ZFS can handle large timestamps, but 32-bit syscalls can't * handle times past January 2038. This check should be removed * once large timestamps are fully supported.
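* For example (hypothetical value): a va_mtime whose tv_sec falls in 2106 does not fit in 32 bits, trips TIMESPEC_OVERFLOW() below, and fails the whole setattr with EOVERFLOW before any transaction is created.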
*/ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap != NULL && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } } attrzp = NULL; aclp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
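* Concretely: when a requested flag already matches the on-disk state (e.g. xoa_appendonly equals the ZFS_APPENDONLY bit), the request bit is parked in tmpxvattr via XVA_CLR_REQ()/XVA_SET_REQ() so no policy check is charged for it, and it is moved back into xvap near the end of this function so the caller still sees the attribute as handled.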
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then taking ownership * has been granted or write_acl is present and the user * has the ability to modify the mode. In that case remove * UID|GID and/or MODE from the mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with oldva.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr().
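* (secpolicy_vnode_setattr() rewrites va_mode from oldva when the MODE bit was trimmed above, hence the explicit save of saved_mode before the call and the restore just below.)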
*/ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr() or taking ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } tx = dmu_tx_create(os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For an existing object upgraded from an old system, the * on-disk layout has no slot for the project ID attribute. * But the quota accounting logic needs to access the related * slots by offset directly, so we need to adjust such old * objects' layouts to move the project ID to a unified and * fixed offset.
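* sa_add_projid() below rewrites the SA layout to add that slot; an EEXIST from it is treated as success, presumably because a concurrent upgrade already added the project ID slot to this object.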
*/ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT0(err); if (attrzp) { vn_seqc_write_begin(ZTOV(attrzp)); err = zfs_acl_chown_setattr(attrzp); vn_seqc_write_end(ZTOV(attrzp)); ASSERT0(err); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. 
*/ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT3S(vp->v_type, ==, VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT0(err2); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK1(tdvp); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK1(*tvpp); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK1(tdvp); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. 
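* (ZFS_VERIFY_ZP() would return EIO from the middle of this function and skip the unwinding below; the z_sa_hdl checks are therefore open-coded so that sdvp and tdvp can be unlocked before the error is propagated.)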
*/ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp; if it disappeared, we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try to acquire the locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK1(nvp); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); VOP_UNLOCK1(*svpp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } #if __FreeBSD_version < 1300124 static void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { cache_purge(fvp); if (tvp != NULL) cache_purge(tvp); cache_purge_negative(tdvp); } #endif /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name.
* tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr, int log) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error = 0; bool want_seqc_end __maybe_unused = false; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/out of an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * If we are using project inheritance, i.e. the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. In * that case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid.
* Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_check(szp, sdzp, tdzp))) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vn_seqc_write_begin(*svpp); vn_seqc_write_begin(sdvp); if (*tvpp != NULL) vn_seqc_write_begin(*tvpp); if (tdvp != *tvpp) vn_seqc_write_begin(tdvp); #if __FreeBSD_version >= 1300102 want_seqc_end = true; #endif vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. 
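* Hence the VERIFY0() below: if undoing the freshly created target name failed, we would commit a transaction with a wrong link count, so panicking appears to be the lesser evil here.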
*/ VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL)); } } if (error == 0) { cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ if (want_seqc_end) { vn_seqc_write_end(*svpp); vn_seqc_write_end(sdvp); if (*tvpp != NULL) vn_seqc_write_end(*tvpp); if (tdvp != *tvpp) vn_seqc_write_end(tdvp); want_seqc_end = false; } VOP_UNLOCK1(*svpp); VOP_UNLOCK1(sdvp); if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); out: /* original two vnodes are locked */ MPASS(!want_seqc_end); if (*tvpp != NULL) VOP_UNLOCK1(*tvpp); if (tdvp != *tvpp) VOP_UNLOCK1(tdvp); return (error); } int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, cred_t *cr, int flags) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; vnode_t *svp, *tvp; int error; svp = tvp = NULL; sdvp = ZTOV(sdzp); tdvp = ZTOV(tdzp); error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); if (sdzp->z_zfsvfs->z_replay == B_FALSE) VOP_UNLOCK1(sdvp); if (error != 0) goto fail; VOP_UNLOCK1(svp); vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK1(tdvp); goto fail; } error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, const char *link, znode_t **zpp, cred_t *cr, int flags) { znode_t *zp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; ASSERT3S(vap->va_type, ==, VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. 
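* (ZNEW asks zfs_dirent_lookup() to succeed only when no entry by that name exists yet; an existing entry surfaces as EEXIST, which is what symlink(2) against an existing path should report.)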
*/ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), __DECONST(void *, link), len, tx); else zfs_sa_symlink(zp, __DECONST(char *, link), len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *zpp = zp; zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ int zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, int flags) { znode_t *tzp; zfsvfs_t *zfsvfs = tdzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. 
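* In other words a hard link to a directory, e.g. link("/tank/dir", "/tank/dir.ln") with hypothetical paths, fails with EPERM just below.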
*/ if (ZTOV(szp)->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, i.e. the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. In * that case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(tdzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(ZTOV(szp), ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: ip - inode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to properly detect that we are read-only, * so check it explicitly here.
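* The F_FREESP request handled below treats bfp->l_len == 0 as "from bfp->l_start to end of file", which is forwarded unchanged to zfs_freesp(zp, off, len, flag, TRUE) at the bottom of this function.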
*/ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ static void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); } CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid)); CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp; zfsvfs_t *zfsvfs; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 
1 : 0; ZFS_EXIT(zfsvfs); #else *valp = 0; #endif return (0); case _PC_ACL_NFS4: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0; ZFS_EXIT(zfsvfs); return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; lr = zfs_rangelock_tryenter(&zp->z_rangelock, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (lr == NULL) { if (rahead != NULL) { *rahead = 0; rahead = NULL; } if (rbehind != NULL) { *rbehind = 0; rbehind = NULL; } break; } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { if (lr != NULL) zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. 
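 *
 * Worked example (assuming 4 KB pages): with obj_size = 10000 and the
 * last requested page spanning offsets [8192, 12288), end = 12288, so
 * the byte count passed below is MIN(12288, 10000) - (12288 - 4096) =
 * 10000 - 8192 = 1808; only the 1808 valid bytes of the final page
 * are requested from the DMU instead of a full PAGE_SIZE.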
*/ error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); if (lr != NULL) zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } #ifndef _SYS_SYSPROTO_H_ struct vop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; }; #endif static int zfs_freebsd_getpages(struct vop_getpages_args *ap) { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); /* * XXX we should be passing a callback to undirty * but that would make the locking messier */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_rangelock_exit(lr); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } #ifndef _SYS_SYSPROTO_H_ struct vop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; }; #endif static int zfs_freebsd_putpages(struct vop_putpages_args *ap) { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } #ifndef _SYS_SYSPROTO_H_ struct vop_bmap_args { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; }; #endif static int zfs_freebsd_bmap(struct vop_bmap_args *ap) { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_open(struct vop_open_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_close(struct vop_close_args *ap) { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_ioctl_args { struct vnode *a_vp; ulong_t a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_ioctl(struct vop_ioctl_args *ap) { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; 
#endif static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } #if __FreeBSD_version >= 1300102 /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ static int zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) { vnode_t *vp; znode_t *zp; uint64_t pflags; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) return (EAGAIN); pflags = atomic_load_64(&zp->z_pflags); if (pflags & ZFS_AV_QUARANTINED) return (EAGAIN); if (pflags & ZFS_XATTR) return (EAGAIN); if ((pflags & ZFS_NO_EXECS_DENIED) == 0) return (EAGAIN); return (0); } #endif #if __FreeBSD_version >= 1300139 static int zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v) { vnode_t *vp; znode_t *zp; char *target; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) { return (EAGAIN); } target = atomic_load_consume_ptr(&zp->z_cached_symlink); if (target == NULL) { return (EAGAIN); } return (cache_symlink_resolve(v->a_fpl, target, strlen(target))); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_access_args { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_access(struct vop_access_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; if (ap->a_accmode == VEXEC) { if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) return (0); } /* * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND. */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(zp, accmode, 0, ap->a_cred); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { #if __FreeBSD_version >= 1300105 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred); #else error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); #endif } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories.
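 *
 * E.g. a VEXEC check on a mode 0644 regular file fails here with
 * EACCES even for the superuser, since none of S_IXUSR, S_IXGRP or
 * S_IXOTH is set; directories are exempt so lookups keep working.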
*/ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT3U(cnp->cn_namelen, <, sizeof (nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, 0, cached)); } static int zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) { return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_cache_lookup(struct vop_lookup_args *ap) { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap, B_FALSE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_create(struct vop_create_args *ap) { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; *ap->a_vpp = NULL; rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode, &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); if (rc == 0) *ap->a_vpp = ZTOV(zp); if (zfsvfs->z_use_namecache && rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_remove(struct vop_remove_args *ap) { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_mkdir(struct vop_mkdir_args *ap) { vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, ap->a_cnp->cn_cred, 0, NULL); if (rc == 0) *ap->a_vpp = ZTOV(zp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; ulong_t **a_cookies; }; #endif static int zfs_freebsd_readdir(struct vop_readdir_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } #ifndef _SYS_SYSPROTO_H_ struct vop_fsync_args { struct vnode *a_vp; int a_waitfor; struct thread *a_td; }; #endif 
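/*
 * Note on the zfs_freebsd_*() functions around here: each is a thin
 * adapter that unpacks the FreeBSD vop_*_args structure and forwards
 * to the platform-independent zfs_*() routine, converting vnodes to
 * znodes with VTOZ(). A minimal sketch of the shape (hypothetical
 * "foo" VOP, for illustration only):
 *
 *	static int
 *	zfs_freebsd_foo(struct vop_foo_args *ap)
 *	{
 *
 *		return (zfs_foo(VTOZ(ap->a_vp), ap->a_cred));
 *	}
 */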
static int zfs_freebsd_fsync(struct vop_fsync_args *ap) { vop_stdfsync(ap); return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_getattr(struct vop_getattr_args *ap) { vattr_t *vap = ap->a_vap; xvattr_t xvap; ulong_t fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_setattr(struct vop_setattr_args *ap) { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; ulong_t fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. 
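 *
 * For example (illustrative): an unprivileged chflags(2) on a file
 * that already carries SF_IMMUTABLE fails below with EPERM, while a
 * privileged caller may clear the flag as long as securelevel_gt()
 * reports the securelevel as 0 or lower.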
*/ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on * objects they have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; #endif static int zfs_freebsd_rename(struct vop_rename_args *ap) { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred, 1); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; #endif static int zfs_freebsd_symlink(struct vop_symlink_args *ap) { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; #if __FreeBSD_version >= 1300139 char *symlink; size_t symlink_len; #endif int rc; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
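 * vattr_init_mask() below is then assumed to derive va_mask from
 * whichever fields the syscall actually initialized, e.g. a valid
 * va_mode yields AT_MODE, giving zfs_symlink() the Solaris-style
 * mask it expects.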
*/ vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); if (rc == 0) { *ap->a_vpp = ZTOV(zp); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); #if __FreeBSD_version >= 1300139 MPASS(zp->z_cached_symlink == NULL); symlink_len = strlen(ap->a_target); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, ap->a_target, symlink_len); symlink[symlink_len] = '\0'; atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)symlink); } #endif } return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; #endif static int zfs_freebsd_readlink(struct vop_readlink_args *ap) { zfs_uio_t uio; int error; #if __FreeBSD_version >= 1300139 znode_t *zp = VTOZ(ap->a_vp); char *symlink, *base; size_t symlink_len; bool trycache; #endif zfs_uio_init(&uio, ap->a_uio); #if __FreeBSD_version >= 1300139 trycache = false; if (zfs_uio_segflg(&uio) == UIO_SYSSPACE && zfs_uio_iovcnt(&uio) == 1) { base = zfs_uio_iovbase(&uio, 0); symlink_len = zfs_uio_iovlen(&uio, 0); trycache = true; } #endif error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL); #if __FreeBSD_version >= 1300139 if (atomic_load_ptr(&zp->z_cached_symlink) != NULL || error != 0 || !trycache) { return (error); } symlink_len -= zfs_uio_resid(&uio); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, base, symlink_len); symlink[symlink_len] = '\0'; if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL, (uintptr_t)symlink)) { cache_symlink_free(symlink, symlink_len + 1); } } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_link(struct vop_link_args *ap) { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); } #ifndef _SYS_SYSPROTO_H_ struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; #if __FreeBSD_version >= 1300123 zfs_inactive(vp, curthread->td_ucred, NULL); #else zfs_inactive(vp, ap->a_td->td_ucred, NULL); #endif return (0); } #if __FreeBSD_version >= 1300042 #ifndef _SYS_SYSPROTO_H_ struct vop_need_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int need; if (vn_need_pageq_flush(vp)) return (1); if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs)) return (1); need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); return (need); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_reclaim(struct vop_reclaim_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT3P(zp, !=, NULL); #if __FreeBSD_version < 1300042 /* Destroy the vm object and flush associated pages. 
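 * Ordering sketch: the VM object is torn down while the znode is
 * still intact, since flushing dirty pages goes through
 * zfs_putpages() and needs a valid z_sa_hdl; only afterwards is it
 * safe for the code below to run zfs_zinactive() or zfs_znode_free().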
*/ vnode_destroy_vobject(vp); #endif /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vp->v_data = NULL; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_fid_args { struct vnode *a_vp; struct fid *a_fid; }; #endif static int zfs_freebsd_fid(struct vop_fid_args *ap) { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_pathconf_args { struct vnode *a_vp; int a_name; register_t *a_retval; } *ap; #endif static int zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); #if __FreeBSD_version >= 1400032 case _PC_DEALLOC_PRESENT: *ap->a_retval = 1; return (0); #endif case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (SET_ERROR(EINVAL)); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (SET_ERROR(EINVAL)); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. 
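 *
 * Resulting attribute names, using a hypothetical attribute "md5"
 * for illustration:
 *
 *	EXTATTR_NAMESPACE_SYSTEM, "md5"  ->  "freebsd:system:md5"
 *	EXTATTR_NAMESPACE_USER, "md5"    ->  "md5"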
*/ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (SET_ERROR(EINVAL)); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (SET_ERROR(ENAMETOOLONG)); } return (0); } static int zfs_ensure_xattr_cached(znode_t *zp) { int error = 0; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); if (zp->z_xattr_cached != NULL) return (0); if (rw_write_held(&zp->z_xattr_lock)) return (zfs_sa_get_xattr(zp)); if (!rw_tryupgrade(&zp->z_xattr_lock)) { rw_exit(&zp->z_xattr_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); } if (zp->z_xattr_cached == NULL) error = zfs_sa_get_xattr(zp); rw_downgrade(&zp->z_xattr_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) { struct thread *td = ap->a_td; struct nameidata nd; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) return (error); flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) return (SET_ERROR(error)); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK1(vp); vn_close(vp, flags, ap->a_cred, td); return (error); } static int zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); uchar_t *nv_value; uint_t nv_size; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname, &nv_value, &nv_size); if (error != 0) return (SET_ERROR(error)); if (ap->a_size != NULL) *ap->a_size = nv_size; else if (ap->a_uio != NULL) error = uiomove(nv_value, nv_size, ap->a_uio); if (error != 0) return (SET_ERROR(error)); return (0); } /* * Vnode operation to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; /* * If the xattr property is off, refuse the request. 
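 * E.g. (sketch) after "zfs set xattr=off" on the dataset, which
 * clears ZSB_XATTR, an extattr_get_file(2) on this filesystem fails
 * with EOPNOTSUPP before the attribute name is even translated.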
*/ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); if (error != 0) return (error); error = ENOENT; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp) rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_getextattr_sa(ap, attrname); if (error == ENOENT) error = zfs_getextattr_dir(ap, attrname); rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) { struct nameidata nd; vnode_t *xvp = NULL, *vp; int error; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) return (error); NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); return (SET_ERROR(error)); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); return (error); } static int zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); nvlist_t *nvl; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); nvl = zp->z_xattr_cached; error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY); if (error != 0) error = SET_ERROR(error); else error = zfs_sa_set_xattr(zp); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); } return (error); } /* * Vnode operation to remove a named attribute. */ static int zfs_deleteextattr(struct vop_deleteextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; /* * If the xattr property is off, refuse the request. 
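 * When it is on, the chain below is seeded with ENOENT: the SA-backed
 * store is consulted first and the hidden xattr directory only if the
 * attribute was not found there, so e.g. an attribute present in
 * neither store surfaces to the caller as ENOATTR, not ENOENT.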
*/ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); if (error != 0) return (error); size_t size = 0; struct vop_getextattr_args vga = { .a_vp = ap->a_vp, .a_size = &size, .a_cred = ap->a_cred, .a_td = ap->a_td, }; error = ENOENT; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zfs_getextattr_sa(&vga, attrname); if (error == 0) error = zfs_deleteextattr_sa(ap, attrname); } if (error == ENOENT) { error = zfs_getextattr_dir(&vga, attrname); if (error == 0) error = zfs_deleteextattr_dir(ap, attrname); } rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) { struct thread *td = ap->a_td; struct nameidata nd; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); if (error != 0) return (error); flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) return (SET_ERROR(error)); VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK1(vp); vn_close(vp, flags, ap->a_cred, td); return (error); } static int zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); nvlist_t *nvl; size_t sa_size; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); nvl = zp->z_xattr_cached; size_t entry_size = ap->a_uio->uio_resid; if (entry_size > DXATTR_MAX_ENTRY_SIZE) return (SET_ERROR(EFBIG)); error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error != 0) return (SET_ERROR(error)); if (sa_size > DXATTR_MAX_SA_SIZE) return (SET_ERROR(EFBIG)); uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP); error = uiomove(buf, entry_size, ap->a_uio); if (error != 0) { error = SET_ERROR(error); } else { error = nvlist_add_byte_array(nvl, attrname, buf, entry_size); if (error != 0) error = SET_ERROR(error); } kmem_free(buf, entry_size); if (error == 0) error = zfs_sa_set_xattr(zp); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); } return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; /* * If the xattr property is off, refuse the request. 
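 * When it is on, note that the SA path below is size-capped: e.g.
 * (illustrative) a value larger than DXATTR_MAX_ENTRY_SIZE, or one
 * that would push the packed nvlist past DXATTR_MAX_SA_SIZE, fails
 * with EFBIG and the write is retried through the directory-based
 * store instead.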
*/ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); if (error != 0) return (error); struct vop_deleteextattr_args vda = { .a_vp = ap->a_vp, .a_cred = ap->a_cred, .a_td = ap->a_td, }; error = ENOENT; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) { error = zfs_setextattr_sa(ap, attrname); if (error == 0) /* * Successfully put into SA, we need to clear the one * in dir if present. */ zfs_deleteextattr_dir(&vda, attrname); } if (error) { error = zfs_setextattr_dir(ap, attrname); if (error == 0 && zp->z_is_sa) /* * Successfully put into dir, we need to clear the one * in SA if present. */ zfs_deleteextattr_sa(&vda, attrname); } rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) { struct thread *td = ap->a_td; struct nameidata nd; uint8_t dirbuf[sizeof (struct dirent)]; struct iovec aiov; struct uio auio; vnode_t *xvp = NULL, *vp; int error, eof; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) { /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) return (SET_ERROR(error)); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; size_t plen = strlen(attrprefix); do { aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof (dirbuf); auio.uio_resid = sizeof (dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); if (error != 0) break; int done = sizeof (dirbuf) - auio.uio_resid; for (int pos = 0; pos < done; ) { struct dirent *dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; else if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; uint8_t nlen = dp->d_namlen - plen; if (ap->a_size != NULL) { *ap->a_size += 1 + nlen; } else if (ap->a_uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. 
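 *
 * E.g. attributes "foo" and "ab" are emitted back to back as the
 * bytes 03 'f' 'o' 'o' 02 'a' 'b', with no NUL terminators, which is
 * the layout extattr_list_file(2) consumers expect.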
*/ error = uiomove(&nlen, 1, ap->a_uio); if (error == 0) { char *namep = dp->d_name + plen; error = uiomove(namep, nlen, ap->a_uio); } if (error != 0) { error = SET_ERROR(error); break; } } } } while (!eof && error == 0); vput(vp); return (error); } static int zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix) { znode_t *zp = VTOZ(ap->a_vp); int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); size_t plen = strlen(attrprefix); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); const char *name = nvpair_name(nvp); if (plen == 0 && strncmp(name, "freebsd:", 8) == 0) continue; else if (strncmp(name, attrprefix, plen) != 0) continue; uint8_t nlen = strlen(name) - plen; if (ap->a_size != NULL) { *ap->a_size += 1 + nlen; } else if (ap->a_uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, ap->a_uio); if (error == 0) { char *namep = __DECONST(char *, name) + plen; error = uiomove(namep, nlen, ap->a_uio); } if (error != 0) { error = SET_ERROR(error); break; } } } return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrprefix[16]; int error; if (ap->a_size != NULL) *ap->a_size = 0; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof (attrprefix)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_listextattr_sa(ap, attrprefix); if (error == 0) error = zfs_listextattr_dir(ap, attrprefix); rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_getacl(struct vop_getacl_args *ap) { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if ((error = zfs_getsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred))) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_setacl(struct vop_setacl_args *ap) { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. 
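 *
 * Worked example (assuming the stock ACL_MAX_ENTRIES of 254): the
 * check below accepts at most 124 entries, since 124 * 2 + 6 = 254
 * still fits, while 125 entries could need 256 slots after a
 * worst-case chmod(2) rewrite.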
*/ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); kmem_free(aaclp, aclbsize); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_aclcheck_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; #if __FreeBSD_version >= 1300045 enum vgetstate vs = vget_prep(covered_vp); #else vhold(covered_vp); #endif ltype = VOP_ISLOCKED(vp); VOP_UNLOCK1(vp); #if __FreeBSD_version >= 1300045 error = vget_finish(covered_vp, LK_SHARED, vs); #else error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); #endif if (error == 0) { #if __FreeBSD_version >= 1300123 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen); #else error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); #endif vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) error = SET_ERROR(ENOENT); return (error); } #if __FreeBSD_version >= 1400032 static int zfs_deallocate(struct vop_deallocate_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; off_t off, len, file_sz; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } zilog = zfsvfs->z_log; off = *ap->a_offset; len = *ap->a_len; file_sz = zp->z_size; if (off + len > file_sz) len = file_sz - off; /* Fast path for out-of-range request. 
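 *
 * E.g. with file_sz = 1000, off = 1500 and len = 100, len was just
 * clamped to -500, so the request is trivially satisfied: nothing
 * past EOF needs deallocating and *a_len is reported back as 0.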
*/ if (len <= 0) { *ap->a_len = 0; ZFS_EXIT(zfsvfs); return (0); } error = zfs_freesp(zp, off, len, O_RDWR, TRUE); if (error == 0) { if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS || (ap->a_ioflag & IO_SYNC) != 0) zil_commit(zilog, zp->z_id); *ap->a_offset = off + len; *ap->a_len = 0; } ZFS_EXIT(zfsvfs); return (error); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, #if __FreeBSD_version >= 1300042 .vop_need_inactive = zfs_freebsd_need_inactive, #endif .vop_reclaim = zfs_freebsd_reclaim, #if __FreeBSD_version >= 1300102 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, #endif #if __FreeBSD_version >= 1300139 .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, #endif .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, #if __FreeBSD_version >= 1400032 .vop_deallocate = zfs_deallocate, #endif .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_cachedlookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #if __FreeBSD_version >= 1300064 .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, #endif + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, #if __FreeBSD_version >= 1300102 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, #endif #if __FreeBSD_version >= 1300139 .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, #endif .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(zfs_fifoops); /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, #endif #if __FreeBSD_version >= 1300139 .vop_fplookup_symlink = VOP_EAGAIN, #endif .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = 
zfs_freebsd_pathconf, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(zfs_shareops); diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 964d288324b4..594dc087aab3 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -1,2122 +1,2124 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. 
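 *
 * With dotimes left at 0 (the default), updates are also coarsened to
 * whole seconds: tv_nsec is zeroed and tv_sec rewritten only when the
 * second changes, so e.g. two opens within the same second leave the
 * timestamp bytes untouched.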
*/ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*dswp == NULL || *devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destroy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } static void devfs_usecount_add(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); if (VN_IS_DOOMED(vp)) { goto out_unlock; } de = vp->v_data; dev = vp->v_rdev; MPASS(de != NULL); MPASS(dev != NULL); dev->si_usecount++; de->de_usecount++; out_unlock: VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static void devfs_usecount_subl(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); de = vp->v_data; dev = vp->v_rdev; if (de == NULL) return; if (dev == NULL) { MPASS(de->de_usecount == 0); return; } if (dev->si_usecount < de->de_usecount) panic("%s: si_usecount underflow for dev %p " "(has %ld, dirent has %d)\n", __func__, dev, dev->si_usecount, de->de_usecount); if (VN_IS_DOOMED(vp)) { dev->si_usecount -= de->de_usecount; de->de_usecount = 0; } else { if (de->de_usecount == 0) panic("%s: de_usecount underflow for dev %p\n", __func__, dev); dev->si_usecount--; de->de_usecount--; } } static void devfs_usecount_sub(struct vnode *vp) { mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static int devfs_usecountl(struct vnode *vp) { VNPASS(vp->v_type == VCHR, vp);
mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); return (vp->v_rdev->si_usecount); } int devfs_usecount(struct vnode *vp) { int count; VNPASS(vp->v_type == VCHR, vp); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); count = devfs_usecountl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); return (count); } void devfs_ctty_ref(struct vnode *vp) { vrefact(vp); devfs_usecount_add(vp); } void devfs_ctty_unref(struct vnode *vp) { devfs_usecount_sub(vp); vrele(vp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); if (!devfs_populate_needed(dmp)) { sx_xlock(&dmp->dm_lock); goto out_nopopulate; } locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } out_nopopulate: if (VN_IS_DOOMED(vp)) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VDIR) { error = ENOENT; goto finished; } dd = vp->v_data; if (vp->v_type == VDIR && dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i = *buflen; i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); *buflen = i; de = devfs_parent_dirent(dd); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. 
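 * The name is assembled right to left in the caller-supplied buffer,
 * which must hold at least SPECNAMELEN + 1 bytes; the returned pointer
 * points into that buffer, or is NULL if the name did not fit.  As a
 * hypothetical example, for a directory entry "net" and a cnp naming
 * "tun0", the result points at the string "net/tun0".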
*/ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; enum vgetstate vs; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { vs = vget_prep(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget_finish(vp, lockmode | LK_RETRY, vs); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if (VN_IS_DOOMED(vp)) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protected by vnode lock */ vp->v_rdev = dev; VNPASS(vp->v_usecount == 1, vp); /* Special casing of ttys for deadfs. Probably redundant.
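 * Setting VV_ISTTY while the devsw is still reachable lets generic
 * code (e.g. the pathconf handler below) keep identifying the vnode
 * as a terminal even after si_devsw has been torn down.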
*/ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; struct devfs_dirent *de = vp->v_data; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque1() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (de->de_usecount == 2 && td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 2 && !VN_IS_DOOMED(vp)) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) devfs_ctty_unref(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 1) dflags |= FLASTCLOSE; devfs_usecount_subl(vp); mtx_unlock(&devfs_de_interlock); if (VN_IS_DOOMED(vp)) { /* Forced close. 
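	 * The vnode was revoked or the device destroyed.  FREVOKE lets
	 * the driver's d_close() distinguish this from an ordinary last
	 * close, and FNONBLOCK asks it not to sleep (e.g. waiting to
	 * drain tty output) on a device that is going away.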
*/ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if ((dflags & FLASTCLOSE) == 0) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } vholdnz(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct session *sess; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = 
devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* * Do nothing if reassigning same control tty, or if the * control tty has already disappeared. If it disappeared, * it's because we were racing with TIOCNOTTY. TIOCNOTTY * already took care of releasing the old vnode and we have * nothing left to do. */ sx_slock(&proctree_lock); sess = td->td_proc->p_session; if (sess->s_ttyvp == vp || sess->s_ttyp == NULL) { sx_sunlock(&proctree_lock); return (0); } devfs_ctty_ref(vp); SESS_LOCK(sess); vpold = sess->s_ttyvp; sess->s_ttyvp = vp; sess->s_ttydp = cdev2priv(dev); SESS_UNLOCK(sess); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) devfs_ctty_unref(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = 
dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. 
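	 * A driver that never initialized si_iosize_max gets the
	 * conservative DFLTPHYS default, which physio() uses to split
	 * large transfers into chunks the device can accept.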
*/ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } if (vp->v_type == VCHR) devfs_usecount_add(vp); vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); if (error != 0 && vp->v_type == VCHR) devfs_usecount_sub(vp); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistent label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } /* * Print out the contents of a special device vnode.
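 * This backs VOP_PRINT() for devfs vnodes, so vn_printf() output (for
 * example from the DDB "show vnode" command) includes the device name.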
*/ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_R); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. 
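 * In practice only cookie-aware callers such as the NFS server pass a
 * non-NULL a_ncookies; devfs hands back zero cookies, signalling that
 * it cannot provide stable directory offsets.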
*/ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static void devfs_reclaiml(struct vnode *vp) { struct devfs_dirent *de; mtx_assert(&devfs_de_interlock, MA_OWNED); de = vp->v_data; if (de != NULL) { MPASS(de->de_usecount == 0); de->de_vnode = NULL; vp->v_data = NULL; } } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp); if (dvp != vp) VOP_UNLOCK(dvp); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. 
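 * The same cdev may be aliased by one vnode per devfs mount (one slot
 * per cdp_dirents[] entry), so the loop below rescans the dirent table
 * and vgone()s every alias it finds, restarting whenever a vget()
 * fails, until no live vnode remains.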
* */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; enum vgetstate vs; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); vs = vget_prep(vp2); mtx_unlock(&devfs_de_interlock); if (vget_finish(vp2, LK_EXCLUSIVE, vs) != 0) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == 
VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred) { return (vnops.fo_stat(fp, sb, cred)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_W); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. 
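	 * For instance, with a descriptor opened without FWRITE (a
	 * hypothetical sketch):
	 *
	 *	fd = open("/dev/foo", O_RDONLY);
	 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	 *
	 * the mmap() call is rejected below with EACCES, since a shared
	 * writable mapping would bypass the open-time access check.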
*/ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_vnodeops); /* Vops for VCHR vnodes in /dev. 
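 * Like devfs_vnodeops above, this vector now supplies
 * vop_stdadd_writecount_nomsync for VOP_ADD_WRITECOUNT().  The variant
 * keeps the usual v_writecount bookkeeping, so the rest of the kernel
 * can still tell that writers exist, but, as the name suggests, skips
 * the msync-related handling that file-backed vnodes need; device
 * nodes have no file data for the syncer to flush.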
*/ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_specops); /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); diff --git a/sys/fs/pseudofs/pseudofs_vnops.c b/sys/fs/pseudofs/pseudofs_vnops.c index 29bb1544e7ad..eae4c1c71ab9 100644 --- a/sys/fs/pseudofs/pseudofs_vnops.c +++ b/sys/fs/pseudofs/pseudofs_vnops.c @@ -1,1172 +1,1173 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_pseudofs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KASSERT_PN_IS_DIR(pn) \ KASSERT((pn)->pn_type == pfstype_root || \ (pn)->pn_type == pfstype_dir || \ (pn)->pn_type == pfstype_procdir, \ ("%s(): VDIR vnode refers to non-directory pfs_node", __func__)) #define KASSERT_PN_IS_FILE(pn) \ KASSERT((pn)->pn_type == pfstype_file, \ ("%s(): VREG vnode refers to non-file pfs_node", __func__)) #define KASSERT_PN_IS_LINK(pn) \ KASSERT((pn)->pn_type == pfstype_symlink, \ ("%s(): VLNK vnode refers to non-link pfs_node", __func__)) #define PFS_MAXBUFSIZ 1024 * 1024 /* * Returns the fileno, adjusted for target pid */ static uint32_t pn_fileno(struct pfs_node *pn, pid_t pid) { KASSERT(pn->pn_fileno > 0, ("%s(): no fileno allocated", __func__)); if (pid != NO_PID) return (pn->pn_fileno * NO_PID + pid); return (pn->pn_fileno); } /* * Returns non-zero if given file is visible to given thread. */ static int pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc) { int visible; if (proc == NULL) return (0); PROC_LOCK_ASSERT(proc, MA_OWNED); visible = ((proc->p_flag & P_WEXIT) == 0); if (visible) visible = (p_cansee(td, proc) == 0); if (visible && pn->pn_vis != NULL) visible = pn_vis(td, proc, pn); if (!visible) return (0); return (1); } static int pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid, struct proc **p) { struct proc *proc; PFS_TRACE(("%s (pid: %d, req: %d)", pn->pn_name, pid, td->td_proc->p_pid)); if (p) *p = NULL; if (pid == NO_PID) PFS_RETURN (1); proc = pfind(pid); if (proc == NULL) PFS_RETURN (0); if (pfs_visible_proc(td, pn, proc)) { if (p) *p = proc; else PROC_UNLOCK(proc); PFS_RETURN (1); } PROC_UNLOCK(proc); PFS_RETURN (0); } static int pfs_lookup_proc(pid_t pid, struct proc **p) { struct proc *proc; proc = pfind(pid); if (proc == NULL) return (0); if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); return (0); } _PHOLD(proc); PROC_UNLOCK(proc); *p = proc; return (1); } /* * Verify permissions */ static int pfs_access(struct vop_access_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct vattr vattr; int error; PFS_TRACE(("%s", pvd->pvd_pn->pn_name)); (void)pvd; error = VOP_GETATTR(vn, &vattr, va->a_cred); if (error) PFS_RETURN (error); error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, va->a_accmode, va->a_cred); PFS_RETURN (error); } /* * Close a file or directory */ static int pfs_close(struct vop_close_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct proc *proc; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); /* * Do nothing unless this is the last close and the node has a * last-close handler. 
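 * The vrefcnt() test below only approximates "last close": pseudofs
 * vnodes are shared through the vnode cache, so any other reference
 * (a concurrent open, a hold from the name cache, ...) suppresses the
 * pn_close callback.  Handlers must not rely on being invoked exactly
 * once per open/close pair.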
*/ if (vrefcnt(vn) > 1 || pn->pn_close == NULL) PFS_RETURN (0); if (pvd->pvd_pid != NO_PID) { proc = pfind(pvd->pvd_pid); } else { proc = NULL; } error = pn_close(va->a_td, proc, pn); if (proc != NULL) PROC_UNLOCK(proc); PFS_RETURN (error); } /* * Get file attributes */ static int pfs_getattr(struct vop_getattr_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct vattr *vap = va->a_vap; struct proc *proc; int error = 0; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (ENOENT); vap->va_type = vn->v_type; vap->va_fileid = pn_fileno(pn, pvd->pvd_pid); vap->va_flags = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_filerev = 0; vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0]; vap->va_nlink = 1; nanotime(&vap->va_ctime); vap->va_atime = vap->va_mtime = vap->va_ctime; switch (pn->pn_type) { case pfstype_procdir: case pfstype_root: case pfstype_dir: #if 0 pfs_lock(pn); /* compute link count */ pfs_unlock(pn); #endif vap->va_mode = 0555; break; case pfstype_file: case pfstype_symlink: vap->va_mode = 0444; break; default: printf("shouldn't be here!\n"); vap->va_mode = 0; break; } if (proc != NULL) { vap->va_uid = proc->p_ucred->cr_ruid; vap->va_gid = proc->p_ucred->cr_rgid; } else { vap->va_uid = 0; vap->va_gid = 0; } if (pn->pn_attr != NULL) error = pn_attr(curthread, proc, pn, vap); if(proc != NULL) PROC_UNLOCK(proc); PFS_RETURN (error); } /* * Perform an ioctl */ static int pfs_ioctl(struct vop_ioctl_args *va) { struct vnode *vn; struct pfs_vdata *pvd; struct pfs_node *pn; struct proc *proc; int error; vn = va->a_vp; vn_lock(vn, LK_SHARED | LK_RETRY); if (VN_IS_DOOMED(vn)) { VOP_UNLOCK(vn); return (EBADF); } pvd = vn->v_data; pn = pvd->pvd_pn; PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) { VOP_UNLOCK(vn); PFS_RETURN (EINVAL); } KASSERT_PN_IS_FILE(pn); if (pn->pn_ioctl == NULL) { VOP_UNLOCK(vn); PFS_RETURN (ENOTTY); } /* * This is necessary because process' privileges may * have changed since the open() call. */ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) { VOP_UNLOCK(vn); PFS_RETURN (EIO); } error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data); if (proc != NULL) PROC_UNLOCK(proc); VOP_UNLOCK(vn); PFS_RETURN (error); } /* * Perform getextattr */ static int pfs_getextattr(struct vop_getextattr_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct proc *proc; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); /* * This is necessary because either process' privileges may * have changed since the open() call. 
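 * For example, the calling process may have dropped privileges, or the
 * target process may have exec'd a set-user-ID image since open(2), so
 * a p_cansee() check that passed at open time can fail now.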
*/ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (pn->pn_getextattr == NULL) error = EOPNOTSUPP; else error = pn_getextattr(curthread, proc, pn, va->a_attrnamespace, va->a_name, va->a_uio, va->a_size, va->a_cred); if (proc != NULL) PROC_UNLOCK(proc); PFS_RETURN (error); } /* * Convert a vnode to its component name */ static int pfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct pfs_vdata *pvd = vp->v_data; struct pfs_node *pd = pvd->pvd_pn; struct pfs_node *pn; struct mount *mp; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; char pidbuf[PFS_NAMELEN]; pid_t pid = pvd->pvd_pid; int len, i, error, locked; i = *buflen; error = 0; pfs_lock(pd); if (vp->v_type == VDIR && pd->pn_type == pfstype_root) { *dvp = vp; vhold(*dvp); pfs_unlock(pd); PFS_RETURN (0); } else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) { len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); i -= len; if (i < 0) { error = ENOMEM; goto failed; } bcopy(pidbuf, buf + i, len); } else { len = strlen(pd->pn_name); i -= len; if (i < 0) { error = ENOMEM; goto failed; } bcopy(pd->pn_name, buf + i, len); } pn = pd->pn_parent; pfs_unlock(pd); mp = vp->v_mount; error = vfs_busy(mp, 0); if (error) return (error); /* * vp is held by caller. */ locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); error = pfs_vncache_alloc(mp, dvp, pn, pid); if (error) { vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); PFS_RETURN(error); } *buflen = i; VOP_UNLOCK(*dvp); vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); PFS_RETURN (0); failed: pfs_unlock(pd); PFS_RETURN(error); } /* * Look up a file or directory */ static int pfs_lookup(struct vop_cachedlookup_args *va) { struct vnode *vn = va->a_dvp; struct vnode **vpp = va->a_vpp; struct componentname *cnp = va->a_cnp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pd = pvd->pvd_pn; struct pfs_node *pn, *pdn = NULL; struct mount *mp; pid_t pid = pvd->pvd_pid; char *pname; int error, i, namelen, visible; PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr)); pfs_assert_not_owned(pd); if (vn->v_type != VDIR) PFS_RETURN (ENOTDIR); KASSERT_PN_IS_DIR(pd); /* * Don't support DELETE or RENAME. CREATE is supported so * that O_CREAT will work, but the lookup will still fail if * the file does not exist. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) PFS_RETURN (EOPNOTSUPP); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= PFS_NAMELEN) PFS_RETURN (ENOENT); /* check that parent directory is visible... */ if (!pfs_visible(curthread, pd, pvd->pvd_pid, NULL)) PFS_RETURN (ENOENT); /* self */ namelen = cnp->cn_namelen; pname = cnp->cn_nameptr; if (namelen == 1 && pname[0] == '.') { pn = pd; *vpp = vn; VREF(vn); PFS_RETURN (0); } mp = vn->v_mount; /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (pd->pn_type == pfstype_root) PFS_RETURN (EIO); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vn); error = vfs_busy(mp, 0); vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); vfs_rel(mp); if (error != 0) PFS_RETURN(ENOENT); if (VN_IS_DOOMED(vn)) { vfs_unbusy(mp); PFS_RETURN(ENOENT); } } VOP_UNLOCK(vn); KASSERT(pd->pn_parent != NULL, ("%s(): non-root directory has no parent", __func__)); /* * This one is tricky. Descendents of procdir nodes * inherit their parent's process affinity, but * there's no easy reverse mapping. 
For simplicity, * we assume that if this node is a procdir, its * parent isn't (which is correct as long as * descendents of procdir nodes are never procdir * nodes themselves) */ if (pd->pn_type == pfstype_procdir) pid = NO_PID; pfs_lock(pd); pn = pd->pn_parent; pfs_unlock(pd); goto got_pnode; } pfs_lock(pd); /* named node */ for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next) if (pn->pn_type == pfstype_procdir) pdn = pn; else if (pn->pn_name[namelen] == '\0' && bcmp(pname, pn->pn_name, namelen) == 0) { pfs_unlock(pd); goto got_pnode; } /* process dependent node */ if ((pn = pdn) != NULL) { pid = 0; for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i) if ((pid = pid * 10 + pname[i] - '0') > PID_MAX) break; if (i == cnp->cn_namelen) { pfs_unlock(pd); goto got_pnode; } } pfs_unlock(pd); PFS_RETURN (ENOENT); got_pnode: pfs_assert_not_owned(pd); pfs_assert_not_owned(pn); visible = pfs_visible(curthread, pn, pid, NULL); if (!visible) { error = ENOENT; goto failed; } error = pfs_vncache_alloc(mp, vpp, pn, pid); if (error) goto failed; if (cnp->cn_flags & ISDOTDOT) { vfs_unbusy(mp); vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vn)) { vput(*vpp); *vpp = NULL; PFS_RETURN(ENOENT); } } if (cnp->cn_flags & MAKEENTRY && !VN_IS_DOOMED(vn)) cache_enter(vn, *vpp, cnp); PFS_RETURN (0); failed: if (cnp->cn_flags & ISDOTDOT) { vfs_unbusy(mp); vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); *vpp = NULL; } PFS_RETURN(error); } /* * Open a file or directory. */ static int pfs_open(struct vop_open_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; int mode = va->a_mode; PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode)); pfs_assert_not_owned(pn); /* check if the requested mode is permitted */ if (((mode & FREAD) && !(mode & PFS_RD)) || ((mode & FWRITE) && !(mode & PFS_WR))) PFS_RETURN (EPERM); /* we don't support locking */ if ((mode & O_SHLOCK) || (mode & O_EXLOCK)) PFS_RETURN (EOPNOTSUPP); PFS_RETURN (0); } struct sbuf_seek_helper { off_t skip_bytes; struct uio *uio; }; static int pfs_sbuf_uio_drain(void *arg, const char *data, int len) { struct sbuf_seek_helper *ssh; struct uio *uio; int error, skipped; ssh = arg; uio = ssh->uio; skipped = 0; /* Need to discard first uio_offset bytes. */ if (ssh->skip_bytes > 0) { if (ssh->skip_bytes >= len) { ssh->skip_bytes -= len; return (len); } data += ssh->skip_bytes; len -= ssh->skip_bytes; skipped = ssh->skip_bytes; ssh->skip_bytes = 0; } error = uiomove(__DECONST(void *, data), len, uio); if (error != 0) return (-error); /* * The fill function has more to emit, but the reader is finished. * This is similar to the truncated read case for non-draining PFS * sbufs, and should be handled appropriately in fill-routines. */ if (uio->uio_resid == 0) return (-ENOBUFS); return (skipped + len); } /* * Read from a file */ static int pfs_read(struct vop_read_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc; struct sbuf *sb = NULL; int error, locked; off_t buflen, buflim; struct sbuf_seek_helper ssh; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) PFS_RETURN (EINVAL); KASSERT_PN_IS_FILE(pn); if (!(pn->pn_flags & PFS_RD)) PFS_RETURN (EBADF); if (pn->pn_fill == NULL) PFS_RETURN (EIO); /* * This is necessary because either process' privileges may * have changed since the open() call. 
*/ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (proc != NULL) { _PHOLD(proc); PROC_UNLOCK(proc); } vhold(vn); locked = VOP_ISLOCKED(vn); VOP_UNLOCK(vn); if (pn->pn_flags & PFS_RAWRD) { PFS_TRACE(("%zd resid", uio->uio_resid)); error = pn_fill(curthread, proc, pn, NULL, uio); PFS_TRACE(("%zd resid", uio->uio_resid)); goto ret; } if (uio->uio_resid < 0 || uio->uio_offset < 0 || uio->uio_resid > OFF_MAX - uio->uio_offset) { error = EINVAL; goto ret; } buflen = uio->uio_offset + uio->uio_resid + 1; if (pn->pn_flags & PFS_AUTODRAIN) /* * We can use a smaller buffer if we can stream output to the * consumer. */ buflim = PAGE_SIZE; else buflim = PFS_MAXBUFSIZ; if (buflen > buflim) buflen = buflim; sb = sbuf_new(sb, NULL, buflen, 0); if (sb == NULL) { error = EIO; goto ret; } if (pn->pn_flags & PFS_AUTODRAIN) { ssh.skip_bytes = uio->uio_offset; ssh.uio = uio; sbuf_set_drain(sb, pfs_sbuf_uio_drain, &ssh); } error = pn_fill(curthread, proc, pn, sb, uio); if (error) { sbuf_delete(sb); goto ret; } /* * XXX: If the buffer overflowed, sbuf_len() will not return * the data length. Then just use the full length because an * overflowed sbuf must be full. */ error = sbuf_finish(sb); if ((pn->pn_flags & PFS_AUTODRAIN)) { /* * ENOBUFS just indicates early termination of the fill * function as the caller's buffer was already filled. Squash * to zero. */ if (uio->uio_resid == 0 && error == ENOBUFS) error = 0; } else { if (error == 0) buflen = sbuf_len(sb); else /* The trailing byte is not valid. */ buflen--; error = uiomove_frombuf(sbuf_data(sb), buflen, uio); } sbuf_delete(sb); ret: vn_lock(vn, locked | LK_RETRY); vdrop(vn); if (proc != NULL) PRELE(proc); PFS_RETURN (error); } /* * Iterate through directory entries */ static int pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd, struct pfs_node **pn, struct proc **p) { int visible; sx_assert(&allproc_lock, SX_SLOCKED); pfs_assert_owned(pd); again: if (*pn == NULL) { /* first node */ *pn = pd->pn_nodes; } else if ((*pn)->pn_type != pfstype_procdir) { /* next node */ *pn = (*pn)->pn_next; } if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) { /* next process */ if (*p == NULL) *p = LIST_FIRST(&allproc); else *p = LIST_NEXT(*p, p_list); /* out of processes: next node */ if (*p == NULL) *pn = (*pn)->pn_next; else PROC_LOCK(*p); } if ((*pn) == NULL) return (-1); if (*p != NULL) { visible = pfs_visible_proc(td, *pn, *p); PROC_UNLOCK(*p); } else if (proc != NULL) { visible = pfs_visible_proc(td, *pn, proc); } else { visible = 1; } if (!visible) goto again; return (0); } /* Directory entry list */ struct pfsentry { STAILQ_ENTRY(pfsentry) link; struct dirent entry; }; STAILQ_HEAD(pfsdirentlist, pfsentry); /* * Return directory entries. 
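 * Every pseudofs directory entry is a fixed-size PFS_DELEN record, so
 * the uio offset must be a multiple of PFS_DELEN and only whole
 * entries are ever transferred.  Children of procdir nodes are
 * synthesized on the fly by walking allproc under allproc_lock.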
*/ static int pfs_readdir(struct vop_readdir_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pd = pvd->pvd_pn; pid_t pid = pvd->pvd_pid; struct proc *p, *proc; struct pfs_node *pn; struct uio *uio; struct pfsentry *pfsent, *pfsent2; struct pfsdirentlist lst; off_t offset; int error, i, resid; STAILQ_INIT(&lst); error = 0; KASSERT(pd->pn_info == vn->v_mount->mnt_data, ("%s(): pn_info does not match mountpoint", __func__)); PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid)); pfs_assert_not_owned(pd); if (vn->v_type != VDIR) PFS_RETURN (ENOTDIR); KASSERT_PN_IS_DIR(pd); uio = va->a_uio; /* only allow reading entire entries */ offset = uio->uio_offset; resid = uio->uio_resid; if (offset < 0 || offset % PFS_DELEN != 0 || (resid && resid < PFS_DELEN)) PFS_RETURN (EINVAL); if (resid == 0) PFS_RETURN (0); proc = NULL; if (pid != NO_PID && !pfs_lookup_proc(pid, &proc)) PFS_RETURN (ENOENT); sx_slock(&allproc_lock); pfs_lock(pd); KASSERT(pid == NO_PID || proc != NULL, ("%s(): no process for pid %lu", __func__, (unsigned long)pid)); if (pid != NO_PID) { PROC_LOCK(proc); /* check if the directory is visible to the caller */ if (!pfs_visible_proc(curthread, pd, proc)) { _PRELE(proc); PROC_UNLOCK(proc); sx_sunlock(&allproc_lock); pfs_unlock(pd); PFS_RETURN (ENOENT); } } /* skip unwanted entries */ for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) { if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) { /* nothing left... */ if (proc != NULL) { _PRELE(proc); PROC_UNLOCK(proc); } pfs_unlock(pd); sx_sunlock(&allproc_lock); PFS_RETURN (0); } } /* fill in entries */ while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 && resid >= PFS_DELEN) { if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV, M_NOWAIT | M_ZERO)) == NULL) { error = ENOMEM; break; } pfsent->entry.d_reclen = PFS_DELEN; pfsent->entry.d_fileno = pn_fileno(pn, pid); /* PFS_DELEN was picked to fit PFS_NAMLEN */ for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i) pfsent->entry.d_name[i] = pn->pn_name[i]; pfsent->entry.d_namlen = i; /* NOTE: d_off is the offset of the *next* entry. 
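		 * With fixed-size records that is simply the current
		 * offset plus PFS_DELEN; seeking to a returned d_off
		 * resumes the read at the following entry.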
*/ pfsent->entry.d_off = offset + PFS_DELEN; switch (pn->pn_type) { case pfstype_procdir: KASSERT(p != NULL, ("reached procdir node with p == NULL")); pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name, PFS_NAMELEN, "%d", p->p_pid); /* fall through */ case pfstype_root: case pfstype_dir: case pfstype_this: case pfstype_parent: pfsent->entry.d_type = DT_DIR; break; case pfstype_file: pfsent->entry.d_type = DT_REG; break; case pfstype_symlink: pfsent->entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type); } PFS_TRACE(("%s", pfsent->entry.d_name)); dirent_terminate(&pfsent->entry); STAILQ_INSERT_TAIL(&lst, pfsent, link); offset += PFS_DELEN; resid -= PFS_DELEN; } if (proc != NULL) { _PRELE(proc); PROC_UNLOCK(proc); } pfs_unlock(pd); sx_sunlock(&allproc_lock); i = 0; STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) { if (error == 0) error = uiomove(&pfsent->entry, PFS_DELEN, uio); free(pfsent, M_IOV); i++; } PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN))); PFS_RETURN (error); } /* * Read a symbolic link */ static int pfs_readlink(struct vop_readlink_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc = NULL; char buf[PATH_MAX]; struct sbuf sb; int error, locked; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VLNK) PFS_RETURN (EINVAL); KASSERT_PN_IS_LINK(pn); if (pn->pn_fill == NULL) PFS_RETURN (EIO); if (pvd->pvd_pid != NO_PID) { if ((proc = pfind(pvd->pvd_pid)) == NULL) PFS_RETURN (EIO); if (proc->p_flag & P_WEXIT) { PROC_UNLOCK(proc); PFS_RETURN (EIO); } _PHOLD(proc); PROC_UNLOCK(proc); } vhold(vn); locked = VOP_ISLOCKED(vn); VOP_UNLOCK(vn); /* sbuf_new() can't fail with a static buffer */ sbuf_new(&sb, buf, sizeof buf, 0); error = pn_fill(curthread, proc, pn, &sb, NULL); if (proc != NULL) PRELE(proc); vn_lock(vn, locked | LK_RETRY); vdrop(vn); if (error) { sbuf_delete(&sb); PFS_RETURN (error); } if (sbuf_finish(&sb) != 0) { sbuf_delete(&sb); PFS_RETURN (ENAMETOOLONG); } error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio); sbuf_delete(&sb); PFS_RETURN (error); } /* * Reclaim a vnode */ static int pfs_reclaim(struct vop_reclaim_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); return (pfs_vncache_free(va->a_vp)); } /* * Set attributes */ static int pfs_setattr(struct vop_setattr_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); /* Silently ignore unchangeable attributes. */ PFS_RETURN (0); } /* * Write to a file */ static int pfs_write(struct vop_write_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc; struct sbuf sb; int error; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) PFS_RETURN (EINVAL); KASSERT_PN_IS_FILE(pn); if (!(pn->pn_flags & PFS_WR)) PFS_RETURN (EBADF); if (pn->pn_fill == NULL) PFS_RETURN (EIO); if (uio->uio_resid > PFS_MAXBUFSIZ) PFS_RETURN (EIO); /* * This is necessary because either process' privileges may * have changed since the open() call. 
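 * For example, the target process may have executed a set-user-ID image
 * since then, or the writer may have dropped privileges, so visibility is
 * rechecked on every write instead of trusting the check made at open time.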
*/ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (proc != NULL) { _PHOLD(proc); PROC_UNLOCK(proc); } if (pn->pn_flags & PFS_RAWWR) { error = pn_fill(curthread, proc, pn, NULL, uio); if (proc != NULL) PRELE(proc); PFS_RETURN (error); } sbuf_uionew(&sb, uio, &error); if (error) { if (proc != NULL) PRELE(proc); PFS_RETURN (error); } error = pn_fill(curthread, proc, pn, &sb, uio); sbuf_delete(&sb); if (proc != NULL) PRELE(proc); PFS_RETURN (error); } /* * Vnode operations */ struct vop_vector pfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = pfs_access, .vop_cachedlookup = pfs_lookup, .vop_close = pfs_close, .vop_create = VOP_EOPNOTSUPP, .vop_getattr = pfs_getattr, .vop_getextattr = pfs_getextattr, .vop_ioctl = pfs_ioctl, .vop_link = VOP_EOPNOTSUPP, .vop_lookup = vfs_cache_lookup, .vop_mkdir = VOP_EOPNOTSUPP, .vop_mknod = VOP_EOPNOTSUPP, .vop_open = pfs_open, .vop_read = pfs_read, .vop_readdir = pfs_readdir, .vop_readlink = pfs_readlink, .vop_reclaim = pfs_reclaim, .vop_remove = VOP_EOPNOTSUPP, .vop_rename = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP, .vop_setattr = pfs_setattr, .vop_symlink = VOP_EOPNOTSUPP, .vop_vptocnp = pfs_vptocnp, .vop_write = pfs_write, + .vop_add_writecount = vop_stdadd_writecount_nomsync, /* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */ }; VFS_VOP_VECTOR_REGISTER(pfs_vnodeops); diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 3b498bf27a79..a59a522d85ab 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -1,1881 +1,1882 @@ /* $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_tmpfs); VFS_SMR_DECLARE; static volatile int tmpfs_rename_restarts; SYSCTL_INT(_vfs_tmpfs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &tmpfs_rename_restarts), 0, "Times rename had to restart due to lock contention"); static int tmpfs_vn_get_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (tmpfs_alloc_vp(mp, arg, lkflags, rvp)); } static int tmpfs_lookup1(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct tmpfs_dirent *de; struct tmpfs_node *dnode, *pnode; struct tmpfs_mount *tm; int error; /* Caller assumes responsibility for ensuring access (VEXEC). */ dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULLVP; /* We cannot be requesting the parent directory of the root node. */ MPASS(IMPLIES(dnode->tn_type == VDIR && dnode->tn_dir.tn_parent == dnode, !(cnp->cn_flags & ISDOTDOT))); TMPFS_ASSERT_LOCKED(dnode); if (dnode->tn_dir.tn_parent == NULL) { error = ENOENT; goto out; } if (cnp->cn_flags & ISDOTDOT) { tm = VFS_TO_TMPFS(dvp->v_mount); pnode = dnode->tn_dir.tn_parent; tmpfs_ref_node(pnode); error = vn_vget_ino_gen(dvp, tmpfs_vn_get_ino_alloc, pnode, cnp->cn_lkflags, vpp); tmpfs_free_node(tm, pnode); if (error != 0) goto out; } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { VREF(dvp); *vpp = dvp; error = 0; } else { de = tmpfs_dir_lookup(dnode, NULL, cnp); if (de != NULL && de->td_node == NULL) cnp->cn_flags |= ISWHITEOUT; if (de == NULL || de->td_node == NULL) { /* * The entry was not found in the directory. * This is OK if we are creating or renaming an * entry and are working on the last component of * the path name. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE || \ cnp->cn_nameiop == RENAME || (cnp->cn_nameiop == DELETE && cnp->cn_flags & DOWHITEOUT && cnp->cn_flags & ISWHITEOUT))) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, curthread); if (error != 0) goto out; /* * Keep the component name in the buffer for * future uses. */ cnp->cn_flags |= SAVENAME; error = EJUSTRETURN; } else error = ENOENT; } else { struct tmpfs_node *tnode; /* * The entry was found, so get its associated * tmpfs_node. */ tnode = de->td_node; /* * If we are not at the last path component and * found a non-directory or non-link entry (which * may itself be pointing to a directory), raise * an error. */ if ((tnode->tn_type != VDIR && tnode->tn_type != VLNK) && !(cnp->cn_flags & ISLASTCN)) { error = ENOTDIR; goto out; } /* * If we are deleting or renaming the entry, keep * track of its tmpfs_dirent so that it can be * easily deleted later. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, curthread); if (error != 0) goto out; /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; if ((dnode->tn_mode & S_ISTXT) && VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, curthread) && VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, curthread)) { error = EPERM; vput(*vpp); *vpp = NULL; goto out; } cnp->cn_flags |= SAVENAME; } else { error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; } } } /* * Store the result of this lookup in the cache. 
Avoid this if the * request was for creation, as it does not improve timings on * empirical tests. */ if ((cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp)) cache_enter(dvp, *vpp, cnp); out: /* * If there were no errors, *vpp cannot be null and it must be * locked. */ MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp))); return (error); } static int tmpfs_cached_lookup(struct vop_cachedlookup_args *v) { return (tmpfs_lookup1(v->a_dvp, v->a_vpp, v->a_cnp)); } static int tmpfs_lookup(struct vop_lookup_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; int error; /* Check accessibility of requested node as a first step. */ error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); return (tmpfs_lookup1(dvp, vpp, cnp)); } static int tmpfs_create(struct vop_create_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; int error; MPASS(vap->va_type == VREG || vap->va_type == VSOCK); error = tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp)) cache_enter(dvp, *vpp, cnp); return (error); } static int tmpfs_mknod(struct vop_mknod_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO) return (EINVAL); return (tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL)); } struct fileops tmpfs_fnops; static int tmpfs_open(struct vop_open_args *v) { struct vnode *vp; struct tmpfs_node *node; struct file *fp; int error, mode; vp = v->a_vp; mode = v->a_mode; node = VP_TO_TMPFS_NODE(vp); /* * The file is still active but all its names have been removed * (e.g. by a "rmdir $(pwd)"). It cannot be opened any more as * it is about to die. */ if (node->tn_links < 1) return (ENOENT); /* If the file is marked append-only, deny write requests. */ if (node->tn_flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE) error = EPERM; else { error = 0; /* For regular files, the call below is a nop. */ KASSERT(vp->v_type != VREG || (node->tn_reg.tn_aobj->flags & OBJ_DEAD) == 0, ("dead object")); vnode_create_vobject(vp, node->tn_size, v->a_td); } fp = v->a_fp; MPASS(fp == NULL || fp->f_data == NULL); if (error == 0 && fp != NULL && vp->v_type == VREG) { tmpfs_ref_node(node); finit_vnode(fp, mode, node, &tmpfs_fnops); } return (error); } static int tmpfs_close(struct vop_close_args *v) { struct vnode *vp = v->a_vp; /* Update node times. */ tmpfs_update(vp); return (0); } int tmpfs_fo_close(struct file *fp, struct thread *td) { struct tmpfs_node *node; node = fp->f_data; if (node != NULL) { MPASS(node->tn_type == VREG); tmpfs_free_node(node->tn_reg.tn_tmp, node); } return (vnops.fo_close(fp, td)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details.
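 * In short: the routine runs in a lockless (SMR) context, with neither the
 * vnode lock nor a reference held, so it must not sleep or touch state that
 * is not SMR-protected; returning EAGAIN aborts the fast path and falls
 * back to the regular, locked lookup.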
*/ int tmpfs_fplookup_vexec(struct vop_fplookup_vexec_args *v) { struct vnode *vp; struct tmpfs_node *node; struct ucred *cred; mode_t all_x, mode; vp = v->a_vp; node = VP_TO_TMPFS_NODE_SMR(vp); if (__predict_false(node == NULL)) return (EAGAIN); all_x = S_IXUSR | S_IXGRP | S_IXOTH; mode = atomic_load_short(&node->tn_mode); if (__predict_true((mode & all_x) == all_x)) return (0); cred = v->a_cred; return (vaccess_vexec_smr(mode, node->tn_uid, node->tn_gid, cred)); } int tmpfs_access(struct vop_access_args *v) { struct vnode *vp = v->a_vp; accmode_t accmode = v->a_accmode; struct ucred *cred = v->a_cred; mode_t all_x = S_IXUSR | S_IXGRP | S_IXOTH; int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); /* * Common case path lookup. */ if (__predict_true(accmode == VEXEC && (node->tn_mode & all_x) == all_x)) return (0); switch (vp->v_type) { case VDIR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VREG: if (accmode & VWRITE && vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VSOCK: /* FALLTHROUGH */ case VFIFO: break; default: error = EINVAL; goto out; } if (accmode & VWRITE && node->tn_flags & IMMUTABLE) { error = EPERM; goto out; } error = vaccess(vp->v_type, node->tn_mode, node->tn_uid, node->tn_gid, accmode, cred); out: MPASS(VOP_ISLOCKED(vp)); return (error); } int tmpfs_stat(struct vop_stat_args *v) { struct vnode *vp = v->a_vp; struct stat *sb = v->a_sb; vm_object_t obj; struct tmpfs_node *node; int error; node = VP_TO_TMPFS_NODE(vp); tmpfs_update_getattr(vp); error = vop_stat_helper_pre(v); if (__predict_false(error)) return (error); sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = node->tn_id; sb->st_mode = node->tn_mode | VTTOIF(vp->v_type); sb->st_nlink = node->tn_links; sb->st_uid = node->tn_uid; sb->st_gid = node->tn_gid; sb->st_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? node->tn_rdev : NODEV; sb->st_size = node->tn_size; sb->st_atim.tv_sec = node->tn_atime.tv_sec; sb->st_atim.tv_nsec = node->tn_atime.tv_nsec; sb->st_mtim.tv_sec = node->tn_mtime.tv_sec; sb->st_mtim.tv_nsec = node->tn_mtime.tv_nsec; sb->st_ctim.tv_sec = node->tn_ctime.tv_sec; sb->st_ctim.tv_nsec = node->tn_ctime.tv_nsec; sb->st_birthtim.tv_sec = node->tn_birthtime.tv_sec; sb->st_birthtim.tv_nsec = node->tn_birthtime.tv_nsec; sb->st_blksize = PAGE_SIZE; sb->st_flags = node->tn_flags; sb->st_gen = node->tn_gen; if (vp->v_type == VREG) { obj = node->tn_reg.tn_aobj; sb->st_blocks = (u_quad_t)obj->resident_page_count * PAGE_SIZE; } else sb->st_blocks = node->tn_size; sb->st_blocks /= S_BLKSIZE; return (vop_stat_helper_post(v, error)); } int tmpfs_getattr(struct vop_getattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; vm_object_t obj; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); tmpfs_update_getattr(vp); vap->va_type = vp->v_type; vap->va_mode = node->tn_mode; vap->va_nlink = node->tn_links; vap->va_uid = node->tn_uid; vap->va_gid = node->tn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = node->tn_id; vap->va_size = node->tn_size; vap->va_blocksize = PAGE_SIZE; vap->va_atime = node->tn_atime; vap->va_mtime = node->tn_mtime; vap->va_ctime = node->tn_ctime; vap->va_birthtime = node->tn_birthtime; vap->va_gen = node->tn_gen; vap->va_flags = node->tn_flags; vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? 
node->tn_rdev : NODEV; if (vp->v_type == VREG) { obj = node->tn_reg.tn_aobj; vap->va_bytes = (u_quad_t)obj->resident_page_count * PAGE_SIZE; } else vap->va_bytes = node->tn_size; vap->va_filerev = 0; return (0); } int tmpfs_setattr(struct vop_setattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; struct ucred *cred = v->a_cred; struct thread *td = curthread; int error; MPASS(VOP_ISLOCKED(vp)); ASSERT_VOP_IN_SEQC(vp); error = 0; /* Abort if any unsettable attribute is given. */ if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || vap->va_bytes != VNOVAL) error = EINVAL; if (error == 0 && (vap->va_flags != VNOVAL)) error = tmpfs_chflags(vp, vap->va_flags, cred, td); if (error == 0 && (vap->va_size != VNOVAL)) error = tmpfs_chsize(vp, vap->va_size, cred, td); if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred, td); if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) error = tmpfs_chmod(vp, vap->va_mode, cred, td); if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL && vap->va_atime.tv_nsec != VNOVAL) || (vap->va_mtime.tv_sec != VNOVAL && vap->va_mtime.tv_nsec != VNOVAL) || (vap->va_birthtime.tv_sec != VNOVAL && vap->va_birthtime.tv_nsec != VNOVAL))) error = tmpfs_chtimes(vp, vap, cred, td); /* * Update the node times. We give preference to the error codes * generated by this function rather than the ones that may arise * from tmpfs_update. */ tmpfs_update(vp); MPASS(VOP_ISLOCKED(vp)); return (error); } static int tmpfs_read(struct vop_read_args *v) { struct vnode *vp; struct uio *uio; struct tmpfs_node *node; vp = v->a_vp; if (vp->v_type != VREG) return (EISDIR); uio = v->a_uio; if (uio->uio_offset < 0) return (EINVAL); node = VP_TO_TMPFS_NODE(vp); tmpfs_set_accessed(VFS_TO_TMPFS(vp->v_mount), node); return (uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio)); } static int tmpfs_read_pgcache(struct vop_read_pgcache_args *v) { struct vnode *vp; struct tmpfs_node *node; vm_object_t object; off_t size; int error; vp = v->a_vp; VNPASS((vn_irflag_read(vp) & VIRF_PGREAD) != 0, vp); if (v->a_uio->uio_offset < 0) return (EINVAL); error = EJUSTRETURN; vfs_smr_enter(); node = VP_TO_TMPFS_NODE_SMR(vp); if (node == NULL) goto out_smr; MPASS(node->tn_type == VREG); MPASS(node->tn_refcount >= 1); object = node->tn_reg.tn_aobj; if (object == NULL) goto out_smr; MPASS(object->type == tmpfs_pager_type); MPASS((object->flags & (OBJ_ANON | OBJ_DEAD | OBJ_SWAP)) == OBJ_SWAP); if (!VN_IS_DOOMED(vp)) { /* size cannot become shorter due to rangelock. 
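 * (The caller is expected to hold a read rangelock over the range being
 * copied, so a concurrent truncation cannot shrink the file; sampling
 * tn_size here and using it after vfs_smr_exit() is therefore safe.)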
*/ size = node->tn_size; tmpfs_set_accessed(node->tn_reg.tn_tmp, node); vfs_smr_exit(); error = uiomove_object(object, size, v->a_uio); return (error); } out_smr: vfs_smr_exit(); return (error); } static int tmpfs_write(struct vop_write_args *v) { struct vnode *vp; struct uio *uio; struct tmpfs_node *node; off_t oldsize; int error, ioflag; mode_t newmode; vp = v->a_vp; uio = v->a_uio; ioflag = v->a_ioflag; error = 0; node = VP_TO_TMPFS_NODE(vp); oldsize = node->tn_size; if (uio->uio_offset < 0 || vp->v_type != VREG) return (EINVAL); if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio->uio_offset = node->tn_size; if (uio->uio_offset + uio->uio_resid > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); if (uio->uio_offset + uio->uio_resid > node->tn_size) { error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid, FALSE); if (error != 0) goto out; } error = uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio); node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; node->tn_accessed = true; if (node->tn_mode & (S_ISUID | S_ISGID)) { if (priv_check_cred(v->a_cred, PRIV_VFS_RETAINSUGID)) { newmode = node->tn_mode & ~(S_ISUID | S_ISGID); vn_seqc_write_begin(vp); atomic_store_short(&node->tn_mode, newmode); vn_seqc_write_end(vp); } } if (error != 0) (void)tmpfs_reg_resize(vp, oldsize, TRUE); out: MPASS(IMPLIES(error == 0, uio->uio_resid == 0)); MPASS(IMPLIES(error != 0, oldsize == node->tn_size)); return (error); } static int tmpfs_deallocate(struct vop_deallocate_args *v) { return (tmpfs_reg_punch_hole(v->a_vp, v->a_offset, v->a_len)); } static int tmpfs_fsync(struct vop_fsync_args *v) { struct vnode *vp = v->a_vp; MPASS(VOP_ISLOCKED(vp)); tmpfs_check_mtime(vp); tmpfs_update(vp); return (0); } static int tmpfs_remove(struct vop_remove_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(VOP_ISLOCKED(vp)); if (vp->v_type == VDIR) { error = EISDIR; goto out; } dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); de = tmpfs_dir_lookup(dnode, node, v->a_cnp); MPASS(de != NULL); /* Files marked as immutable or append-only cannot be deleted. */ if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) || (dnode->tn_flags & APPEND)) { error = EPERM; goto out; } /* Remove the entry from the directory; as it is a file, we do not * have to change the number of hard links of the directory. */ tmpfs_dir_detach(dvp, de); if (v->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(dvp, v->a_cnp); /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de); node->tn_status |= TMPFS_NODE_CHANGED; node->tn_accessed = true; error = 0; out: return (error); } static int tmpfs_link(struct vop_link_args *v) { struct vnode *dvp = v->a_tdvp; struct vnode *vp = v->a_vp; struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(cnp->cn_flags & HASBUF); MPASS(dvp != vp); /* XXX When can this be false? */ node = VP_TO_TMPFS_NODE(vp); /* Ensure that we do not overflow the maximum number of links imposed * by the system. 
*/ MPASS(node->tn_links <= TMPFS_LINK_MAX); if (node->tn_links == TMPFS_LINK_MAX) { error = EMLINK; goto out; } /* We cannot create links of files marked immutable or append-only. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* Allocate a new directory entry to represent the node. */ error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) goto out; /* Insert the new directory entry into the appropriate directory. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); /* vp link count has changed, so update node times. */ node->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(vp); error = 0; out: return (error); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int tmpfs_rename_relock(struct vnode *fdvp, struct vnode **fvpp, struct vnode *tdvp, struct vnode **tvpp, struct componentname *fcnp, struct componentname *tcnp) { struct vnode *nvp; struct mount *mp; struct tmpfs_dirent *de; int error, restarts = 0; VOP_UNLOCK(tdvp); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp); mp = fdvp->v_mount; relock: restarts += 1; error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(fdvp), NULL, fcnp); if (de == NULL) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if ((fcnp->cn_flags & ISDOTDOT) != 0 || (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.')) error = EINVAL; else error = ENOENT; goto releout; } error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp); /* * Concurrent rename race. */ if (nvp == tdvp) { vrele(nvp); error = EINVAL; goto releout; } vrele(*fvpp); *fvpp = nvp; goto relock; } vrele(*fvpp); *fvpp = nvp; VOP_UNLOCK(*fvpp); /* * Re-resolve tvp and acquire the vnode lock if present. */ de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(tdvp), NULL, tcnp); /* * If tvp disappeared we just carry on. */ if (de == NULL && *tvpp != NULL) { vrele(*tvpp); *tvpp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (de != NULL) { nvp = NULL; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (*tvpp != NULL) vrele(*tvpp); *tvpp = nvp; if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp); /* * fdvp contains fvp, thus tvp (=fdvp) is not empty. 
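 * In other words, the blocking re-lookup resolved the target to the
 * source's own parent directory, which still holds the entry being
 * renamed, so an overwriting rename onto it must fail with ENOTEMPTY
 * just as the normal path would.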
*/ if (nvp == fdvp) { error = ENOTEMPTY; goto releout; } goto relock; } } tmpfs_rename_restarts += restarts; return (0); releout: vrele(fdvp); vrele(*fvpp); vrele(tdvp); if (*tvpp != NULL) vrele(*tvpp); tmpfs_rename_restarts += restarts; return (error); } static int tmpfs_rename(struct vop_rename_args *v) { struct vnode *fdvp = v->a_fdvp; struct vnode *fvp = v->a_fvp; struct componentname *fcnp = v->a_fcnp; struct vnode *tdvp = v->a_tdvp; struct vnode *tvp = v->a_tvp; struct componentname *tcnp = v->a_tcnp; char *newname; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *fdnode; struct tmpfs_node *fnode; struct tmpfs_node *tnode; struct tmpfs_node *tdnode; int error; bool want_seqc_end; MPASS(VOP_ISLOCKED(tdvp)); MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp))); MPASS(fcnp->cn_flags & HASBUF); MPASS(tcnp->cn_flags & HASBUF); want_seqc_end = false; /* * Disallow cross-device renames. * XXX Why isn't this done by the caller? */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULL && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto out; } /* If source and target are the same file, there is nothing to do. */ if (fvp == tvp) { error = 0; goto out; } /* * If we need to move the directory between entries, lock the * source so that we can safely operate on it. */ if (fdvp != tdvp && fdvp != tvp) { if (vn_lock(fdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { error = tmpfs_rename_relock(fdvp, &fvp, tdvp, &tvp, fcnp, tcnp); if (error != 0) return (error); ASSERT_VOP_ELOCKED(fdvp, "tmpfs_rename: fdvp not locked"); ASSERT_VOP_ELOCKED(tdvp, "tmpfs_rename: tdvp not locked"); if (tvp != NULL) ASSERT_VOP_ELOCKED(tvp, "tmpfs_rename: tvp not locked"); if (fvp == tvp) { error = 0; goto out_locked; } } } if (tvp != NULL) vn_seqc_write_begin(tvp); vn_seqc_write_begin(tdvp); vn_seqc_write_begin(fvp); vn_seqc_write_begin(fdvp); want_seqc_end = true; tmp = VFS_TO_TMPFS(tdvp->v_mount); tdnode = VP_TO_TMPFS_DIR(tdvp); tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp); fdnode = VP_TO_TMPFS_DIR(fdvp); fnode = VP_TO_TMPFS_NODE(fvp); de = tmpfs_dir_lookup(fdnode, fnode, fcnp); /* * Entry can disappear before we lock fdvp, * also avoid manipulating '.' and '..' entries. */ if (de == NULL) { if ((fcnp->cn_flags & ISDOTDOT) != 0 || (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.')) error = EINVAL; else error = ENOENT; goto out_locked; } MPASS(de->td_node == fnode); /* * If re-naming a directory to another preexisting directory * ensure that the target directory is empty so that its * removal causes no side effects. * Kern_rename guarantees the destination to be a directory * if the source is one. */ if (tvp != NULL) { MPASS(tnode != NULL); if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (tdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) { if (tnode->tn_size > 0) { error = ENOTEMPTY; goto out_locked; } } else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) { error = ENOTDIR; goto out_locked; } else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) { error = EISDIR; goto out_locked; } else { MPASS(fnode->tn_type != VDIR && tnode->tn_type != VDIR); } } if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } /* * Ensure that we have enough memory to hold the new name, if it * has to be changed. 
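 * The M_WAITOK allocation is performed up front, before any directory
 * state is modified, so the rename cannot fail at a point where the
 * source entry has already been detached.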
*/ if (fcnp->cn_namelen != tcnp->cn_namelen || bcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0) { newname = malloc(tcnp->cn_namelen, M_TMPFSNAME, M_WAITOK); } else newname = NULL; /* * If the node is being moved to another directory, we have to do * the move. */ if (fdnode != tdnode) { /* * In case we are moving a directory, we have to adjust its * parent to point to the new parent. */ if (de->td_node->tn_type == VDIR) { struct tmpfs_node *n; /* * Ensure the target directory is not a child of the * directory being moved. Otherwise, we'd end up * with stale nodes. */ n = tdnode; /* * TMPFS_LOCK guarantees that no nodes are freed while * traversing the list. Nodes can only be marked as * removed: tn_parent == NULL. */ TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(n); while (n != n->tn_dir.tn_parent) { struct tmpfs_node *parent; if (n == fnode) { TMPFS_NODE_UNLOCK(n); TMPFS_UNLOCK(tmp); error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } parent = n->tn_dir.tn_parent; TMPFS_NODE_UNLOCK(n); if (parent == NULL) { n = NULL; break; } TMPFS_NODE_LOCK(parent); if (parent->tn_dir.tn_parent == NULL) { TMPFS_NODE_UNLOCK(parent); n = NULL; break; } n = parent; } TMPFS_UNLOCK(tmp); if (n == NULL) { error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } TMPFS_NODE_UNLOCK(n); /* Adjust the parent pointer. */ TMPFS_VALIDATE_DIR(fnode); TMPFS_NODE_LOCK(de->td_node); de->td_node->tn_dir.tn_parent = tdnode; TMPFS_NODE_UNLOCK(de->td_node); /* * As a result of changing the target of the '..' * entry, the link count of the source and target * directories has to be adjusted. */ TMPFS_NODE_LOCK(tdnode); TMPFS_ASSERT_LOCKED(tdnode); tdnode->tn_links++; TMPFS_NODE_UNLOCK(tdnode); TMPFS_NODE_LOCK(fdnode); TMPFS_ASSERT_LOCKED(fdnode); fdnode->tn_links--; TMPFS_NODE_UNLOCK(fdnode); } } /* * Do the move: just remove the entry from the source directory * and insert it into the target one. */ tmpfs_dir_detach(fdvp, de); if (fcnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(fdvp, fcnp); if (tcnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(tdvp, tcnp); /* * If the name has changed, we need to make it effective by changing * it in the directory entry. */ if (newname != NULL) { MPASS(tcnp->cn_namelen <= MAXNAMLEN); free(de->ud.td_name, M_TMPFSNAME); de->ud.td_name = newname; tmpfs_dirent_init(de, tcnp->cn_nameptr, tcnp->cn_namelen); fnode->tn_status |= TMPFS_NODE_CHANGED; tdnode->tn_status |= TMPFS_NODE_MODIFIED; } /* * If we are overwriting an entry, we have to remove the old one * from the target directory. */ if (tvp != NULL) { struct tmpfs_dirent *tde; /* Remove the old entry from the target directory. */ tde = tmpfs_dir_lookup(tdnode, tnode, tcnp); tmpfs_dir_detach(tdvp, tde); /* * Free the directory entry we just deleted. Note that the * node referred by it will not be removed until the vnode is * really reclaimed. */ tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde); } tmpfs_dir_attach(tdvp, de); if (tmpfs_use_nc(fvp)) { cache_vop_rename(fdvp, fvp, tdvp, tvp, fcnp, tcnp); } error = 0; out_locked: if (fdvp != tdvp && fdvp != tvp) VOP_UNLOCK(fdvp); out: if (want_seqc_end) { if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); } /* * Release target nodes. * XXX: I don't understand when tdvp can be the same as tvp, but * other code takes care of this... */ if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp != NULL) vput(tvp); /* Release source nodes.
*/ vrele(fdvp); vrele(fvp); return (error); } static int tmpfs_mkdir(struct vop_mkdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; MPASS(vap->va_type == VDIR); return (tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL)); } static int tmpfs_rmdir(struct vop_rmdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(VOP_ISLOCKED(vp)); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_DIR(vp); /* Directories with more than two entries ('.' and '..') cannot be * removed. */ if (node->tn_size > 0) { error = ENOTEMPTY; goto out; } if ((dnode->tn_flags & APPEND) || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* This invariant holds only if we are not trying to remove "..". * We checked for that above so this is safe now. */ MPASS(node->tn_dir.tn_parent == dnode); /* Get the directory entry associated with node (vp). This was * filled by tmpfs_lookup while looking up the entry. */ de = tmpfs_dir_lookup(dnode, node, v->a_cnp); MPASS(TMPFS_DIRENT_MATCHES(de, v->a_cnp->cn_nameptr, v->a_cnp->cn_namelen)); /* Check flags to see if we are allowed to remove the directory. */ if ((dnode->tn_flags & APPEND) != 0 || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0) { error = EPERM; goto out; } /* Detach the directory entry from the directory (dnode). */ tmpfs_dir_detach(dvp, de); if (v->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(dvp, v->a_cnp); /* No vnode should be allocated for this entry from this point */ TMPFS_NODE_LOCK(node); node->tn_links--; node->tn_dir.tn_parent = NULL; node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; node->tn_accessed = true; TMPFS_NODE_UNLOCK(node); TMPFS_NODE_LOCK(dnode); dnode->tn_links--; dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; dnode->tn_accessed = true; TMPFS_NODE_UNLOCK(dnode); if (tmpfs_use_nc(dvp)) { cache_vop_rmdir(dvp, vp); } /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de); /* Release the deleted vnode (will destroy the node, notify * interested parties and clean it from the cache). */ dnode->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(dvp); error = 0; out: return (error); } static int tmpfs_symlink(struct vop_symlink_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; const char *target = v->a_target; #ifdef notyet /* XXX FreeBSD BUG: kern_symlink is not setting VLNK */ MPASS(vap->va_type == VLNK); #else vap->va_type = VLNK; #endif return (tmpfs_alloc_file(dvp, vpp, vap, cnp, target)); } static int tmpfs_readdir(struct vop_readdir_args *va) { struct vnode *vp; struct uio *uio; struct tmpfs_mount *tm; struct tmpfs_node *node; u_long **cookies; int *eofflag, *ncookies; ssize_t startresid; int error, maxcookies; vp = va->a_vp; uio = va->a_uio; eofflag = va->a_eofflag; cookies = va->a_cookies; ncookies = va->a_ncookies; /* This operation only makes sense on directory nodes. */ if (vp->v_type != VDIR) return (ENOTDIR); maxcookies = 0; node = VP_TO_TMPFS_DIR(vp); tm = VFS_TO_TMPFS(vp->v_mount); startresid = uio->uio_resid; /* Allocate cookies for NFS and compat modules. 
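 * One cookie is returned per emitted dirent; the "+ 2" in the sizing
 * below accounts for the synthesized "." and ".." entries, which are not
 * stored as tmpfs_dirents and hence are not reflected in tn_size.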
*/ if (cookies != NULL && ncookies != NULL) { maxcookies = howmany(node->tn_size, sizeof(struct tmpfs_dirent)) + 2; *cookies = malloc(maxcookies * sizeof(**cookies), M_TEMP, M_WAITOK); *ncookies = 0; } if (cookies == NULL) error = tmpfs_dir_getdents(tm, node, uio, 0, NULL, NULL); else error = tmpfs_dir_getdents(tm, node, uio, maxcookies, *cookies, ncookies); /* Buffer was filled without hitting EOF. */ if (error == EJUSTRETURN) error = (uio->uio_resid != startresid) ? 0 : EINVAL; if (error != 0 && cookies != NULL && ncookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } if (eofflag != NULL) *eofflag = (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF); return (error); } static int tmpfs_readlink(struct vop_readlink_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; int error; struct tmpfs_node *node; MPASS(uio->uio_offset == 0); MPASS(vp->v_type == VLNK); node = VP_TO_TMPFS_NODE(vp); error = uiomove(node->tn_link_target, MIN(node->tn_size, uio->uio_resid), uio); tmpfs_set_accessed(VFS_TO_TMPFS(vp->v_mount), node); return (error); } /* * VOP_FPLOOKUP_SYMLINK routines are subject to special circumstances, see * the comment above cache_fplookup for details. * * Check tmpfs_alloc_node for tmpfs-specific synchronisation notes. */ static int tmpfs_fplookup_symlink(struct vop_fplookup_symlink_args *v) { struct vnode *vp; struct tmpfs_node *node; char *symlink; vp = v->a_vp; node = VP_TO_TMPFS_NODE_SMR(vp); if (__predict_false(node == NULL)) return (EAGAIN); if (!atomic_load_char(&node->tn_link_smr)) return (EAGAIN); symlink = atomic_load_ptr(&node->tn_link_target); if (symlink == NULL) return (EAGAIN); return (cache_symlink_resolve(v->a_fpl, symlink, node->tn_size)); } static int tmpfs_inactive(struct vop_inactive_args *v) { struct vnode *vp; struct tmpfs_node *node; vp = v->a_vp; node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) vrecycle(vp); else tmpfs_check_mtime(vp); return (0); } static int tmpfs_need_inactive(struct vop_need_inactive_args *ap) { struct vnode *vp; struct tmpfs_node *node; struct vm_object *obj; vp = ap->a_vp; node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) goto need; if (vp->v_type == VREG) { obj = vp->v_object; if (obj->generation != obj->cleangeneration) goto need; } return (0); need: return (1); } int tmpfs_reclaim(struct vop_reclaim_args *v) { struct vnode *vp; struct tmpfs_mount *tmp; struct tmpfs_node *node; bool unlock; vp = v->a_vp; node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); if (vp->v_type == VREG) tmpfs_destroy_vobject(vp, node->tn_reg.tn_aobj); vp->v_object = NULL; TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(node); tmpfs_free_vp(vp); /* * If the node referenced by this vnode was deleted by the user, * we must free its associated data structures (now that the vnode * is being reclaimed). 
*/ unlock = true; if (node->tn_links == 0 && (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0) { node->tn_vpstate = TMPFS_VNODE_DOOMED; unlock = !tmpfs_free_node_locked(tmp, node, true); } if (unlock) { TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); } MPASS(vp->v_data == NULL); return (0); } int tmpfs_print(struct vop_print_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%lx, links %jd\n", node, node->tn_flags, (uintmax_t)node->tn_links); printf("\tmode 0%o, owner %d, group %d, size %jd, status 0x%x\n", node->tn_mode, node->tn_uid, node->tn_gid, (intmax_t)node->tn_size, node->tn_status); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } int tmpfs_pathconf(struct vop_pathconf_args *v) { struct vnode *vp = v->a_vp; int name = v->a_name; long *retval = v->a_retval; int error; error = 0; switch (name) { case _PC_LINK_MAX: *retval = TMPFS_LINK_MAX; break; case _PC_SYMLINK_MAX: *retval = MAXPATHLEN; break; case _PC_NAME_MAX: *retval = NAME_MAX; break; case _PC_PIPE_BUF: if (vp->v_type == VDIR || vp->v_type == VFIFO) *retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *retval = 1; break; case _PC_NO_TRUNC: *retval = 1; break; case _PC_SYNC_IO: *retval = 1; break; case _PC_FILESIZEBITS: *retval = 64; break; default: error = vop_stdpathconf(v); } return (error); } static int tmpfs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct tmpfs_fid_data tfd; struct tmpfs_node *node; struct fid *fhp; node = VP_TO_TMPFS_NODE(ap->a_vp); fhp = ap->a_fhp; fhp->fid_len = sizeof(tfd); /* * Copy into fid_data from the stack to avoid unaligned pointer use. * See the comment in sys/mount.h on struct fid for details. */ tfd.tfd_id = node->tn_id; tfd.tfd_gen = node->tn_gen; memcpy(fhp->fid_data, &tfd, fhp->fid_len); return (0); } static int tmpfs_whiteout(struct vop_whiteout_args *ap) { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct tmpfs_dirent *de; switch (ap->a_flags) { case LOOKUP: return (0); case CREATE: de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); if (de != NULL) return (de->td_node == NULL ? 
0 : EEXIST); return (tmpfs_dir_whiteout_add(dvp, cnp)); case DELETE: tmpfs_dir_whiteout_remove(dvp, cnp); return (0); default: panic("tmpfs_whiteout: unknown op"); } } static int tmpfs_vptocnp_dir(struct tmpfs_node *tn, struct tmpfs_node *tnp, struct tmpfs_dirent **pde) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; for (de = tmpfs_dir_first(tnp, &dc); de != NULL; de = tmpfs_dir_next(tnp, &dc)) { if (de->td_node == tn) { *pde = de; return (0); } } return (ENOENT); } static int tmpfs_vptocnp_fill(struct vnode *vp, struct tmpfs_node *tn, struct tmpfs_node *tnp, char *buf, size_t *buflen, struct vnode **dvp) { struct tmpfs_dirent *de; int error, i; error = vn_vget_ino_gen(vp, tmpfs_vn_get_ino_alloc, tnp, LK_SHARED, dvp); if (error != 0) return (error); error = tmpfs_vptocnp_dir(tn, tnp, &de); if (error == 0) { i = *buflen; i -= de->td_namelen; if (i < 0) { error = ENOMEM; } else { bcopy(de->ud.td_name, buf + i, de->td_namelen); *buflen = i; } } if (error == 0) { if (vp != *dvp) VOP_UNLOCK(*dvp); } else { if (vp != *dvp) vput(*dvp); else vrele(vp); } return (error); } static int tmpfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp, **dvp; struct tmpfs_node *tn, *tnp, *tnp1; struct tmpfs_dirent *de; struct tmpfs_mount *tm; char *buf; size_t *buflen; int error; vp = ap->a_vp; dvp = ap->a_vpp; buf = ap->a_buf; buflen = ap->a_buflen; tm = VFS_TO_TMPFS(vp->v_mount); tn = VP_TO_TMPFS_NODE(vp); if (tn->tn_type == VDIR) { tnp = tn->tn_dir.tn_parent; if (tnp == NULL) return (ENOENT); tmpfs_ref_node(tnp); error = tmpfs_vptocnp_fill(vp, tn, tn->tn_dir.tn_parent, buf, buflen, dvp); tmpfs_free_node(tm, tnp); return (error); } restart: TMPFS_LOCK(tm); restart_locked: LIST_FOREACH_SAFE(tnp, &tm->tm_nodes_used, tn_entries, tnp1) { if (tnp->tn_type != VDIR) continue; TMPFS_NODE_LOCK(tnp); tmpfs_ref_node(tnp); /* * tn_vnode cannot be instantiated while we hold the * node lock, so the directory cannot be changed while * we iterate over it. Do this to avoid instantiating * vnode for directories which cannot point to our * node. */ error = tnp->tn_vnode == NULL ? tmpfs_vptocnp_dir(tn, tnp, &de) : 0; if (error == 0) { TMPFS_NODE_UNLOCK(tnp); TMPFS_UNLOCK(tm); error = tmpfs_vptocnp_fill(vp, tn, tnp, buf, buflen, dvp); if (error == 0) { tmpfs_free_node(tm, tnp); return (0); } if (VN_IS_DOOMED(vp)) { tmpfs_free_node(tm, tnp); return (ENOENT); } TMPFS_LOCK(tm); TMPFS_NODE_LOCK(tnp); } if (tmpfs_free_node_locked(tm, tnp, false)) { goto restart; } else { KASSERT(tnp->tn_refcount > 0, ("node %p refcount zero", tnp)); if (tnp->tn_attached) { tnp1 = LIST_NEXT(tnp, tn_entries); TMPFS_NODE_UNLOCK(tnp); } else { TMPFS_NODE_UNLOCK(tnp); goto restart_locked; } } } TMPFS_UNLOCK(tm); return (ENOENT); } /* * Vnode operations vector used for files stored in a tmpfs file system. 
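 *
 * Slots left unset fall through the vop_default chain (here, to
 * default_vnodeops).  A derived vector may chain to this one and override
 * individual operations, as tmpfs_vnodeop_nonc_entries does below for
 * vop_lookup; for example (example_read being a hypothetical handler):
 *
 *	struct vop_vector example_entries = {
 *		.vop_default = &tmpfs_vnodeop_entries,
 *		.vop_read = example_read,
 *	};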
*/ struct vop_vector tmpfs_vnodeop_entries = { .vop_default = &default_vnodeops, .vop_lookup = vfs_cache_lookup, .vop_cachedlookup = tmpfs_cached_lookup, .vop_create = tmpfs_create, .vop_mknod = tmpfs_mknod, .vop_open = tmpfs_open, .vop_close = tmpfs_close, .vop_fplookup_vexec = tmpfs_fplookup_vexec, .vop_fplookup_symlink = tmpfs_fplookup_symlink, .vop_access = tmpfs_access, .vop_stat = tmpfs_stat, .vop_getattr = tmpfs_getattr, .vop_setattr = tmpfs_setattr, .vop_read = tmpfs_read, .vop_read_pgcache = tmpfs_read_pgcache, .vop_write = tmpfs_write, .vop_deallocate = tmpfs_deallocate, .vop_fsync = tmpfs_fsync, .vop_remove = tmpfs_remove, .vop_link = tmpfs_link, .vop_rename = tmpfs_rename, .vop_mkdir = tmpfs_mkdir, .vop_rmdir = tmpfs_rmdir, .vop_symlink = tmpfs_symlink, .vop_readdir = tmpfs_readdir, .vop_readlink = tmpfs_readlink, .vop_inactive = tmpfs_inactive, .vop_need_inactive = tmpfs_need_inactive, .vop_reclaim = tmpfs_reclaim, .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, .vop_vptofh = tmpfs_vptofh, .vop_whiteout = tmpfs_whiteout, .vop_bmap = VOP_EOPNOTSUPP, .vop_vptocnp = tmpfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, + .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(tmpfs_vnodeop_entries); /* * Same vector for mounts which do not use namecache. */ struct vop_vector tmpfs_vnodeop_nonc_entries = { .vop_default = &tmpfs_vnodeop_entries, .vop_lookup = tmpfs_lookup, }; VFS_VOP_VECTOR_REGISTER(tmpfs_vnodeop_nonc_entries); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index 763e9a943f7d..50829ad3044e 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -1,1754 +1,1778 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_norename(struct vop_rename_args *); static int vop_nostrategy(struct vop_strategy_args *); static int get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, int dirbuflen, off_t *off, char **cpos, int *len, int *eofflag, struct thread *td); static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td); #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) static int vop_stdis_text(struct vop_is_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap); static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap); static int vop_stdstat(struct vop_stat_args *ap); static int vop_stdvput_pair(struct vop_vput_pair_args *ap); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * * Note that every filesystem has to implement either vop_access * or vop_accessx; failing to do so will result in immediate crash * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(), * which calls vop_stdaccess() etc. */ struct vop_vector default_vnodeops = { .vop_default = NULL, .vop_bypass = VOP_EOPNOTSUPP, .vop_access = vop_stdaccess, .vop_accessx = vop_stdaccessx, .vop_advise = vop_stdadvise, .vop_advlock = vop_stdadvlock, .vop_advlockasync = vop_stdadvlockasync, .vop_advlockpurge = vop_stdadvlockpurge, .vop_allocate = vop_stdallocate, .vop_deallocate = vop_stddeallocate, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, .vop_stat = vop_stdstat, .vop_fdatasync = vop_stdfdatasync, .vop_getpages = vop_stdgetpages, .vop_getpages_async = vop_stdgetpages_async, .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_need_inactive = vop_stdneed_inactive, .vop_ioctl = vop_stdioctl, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, .vop_lock1 = vop_stdlock, .vop_lookup = vop_nolookup, .vop_open = VOP_NULL, .vop_pathconf = VOP_EINVAL, .vop_poll = vop_nopoll, .vop_putpages = vop_stdputpages, .vop_readlink = VOP_EINVAL, .vop_read_pgcache = vop_stdread_pgcache, .vop_rename = vop_norename, .vop_revoke = VOP_PANIC, .vop_strategy = vop_nostrategy, .vop_unlock = vop_stdunlock, .vop_vptocnp = vop_stdvptocnp, .vop_vptofh = vop_stdvptofh, .vop_unp_bind = vop_stdunp_bind, .vop_unp_connect = vop_stdunp_connect, .vop_unp_detach = vop_stdunp_detach, .vop_is_text = vop_stdis_text, .vop_set_text = vop_stdset_text, .vop_unset_text = vop_stdunset_text, .vop_add_writecount = vop_stdadd_writecount, .vop_copy_file_range = vop_stdcopy_file_range, .vop_vput_pair = vop_stdvput_pair, }; VFS_VOP_VECTOR_REGISTER(default_vnodeops); /* * Series of placeholder functions for various error returns for * VOPs. 
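 *
 * Filesystems wire these directly into their vop_vector slots to get a
 * fixed errno without writing a stub of their own, e.g.:
 *
 *	.vop_create = VOP_EOPNOTSUPP,
 *	.vop_revoke = VOP_PANIC,
 *
 * as the pfs and tmpfs vectors above do for operations they do not
 * support.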
*/ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_enoent(struct vop_generic_args *ap) { return (ENOENT); } int vop_eagain(struct vop_generic_args *ap) { return (EAGAIN); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. */ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std<something> and vop_no<something> are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_<SOMETHING>(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_norename: * * Handle unlock and reference counting for arguments of vop_rename * for filesystems that do not implement the rename operation. */ static int vop_norename(struct vop_rename_args *ap) { vop_rename_fail(ap); return (EOPNOTSUPP); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. */ static int vop_nostrategy (struct vop_strategy_args *ap) { printf("No strategy for buffer at %p\n", ap->a_bp); vn_printf(ap->a_vp, "vnode "); ap->a_bp->b_ioflags |= BIO_ERROR; ap->a_bp->b_error = EOPNOTSUPP; bufdone(ap->a_bp); return (EOPNOTSUPP); } static int get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, int dirbuflen, off_t *off, char **cpos, int *len, int *eofflag, struct thread *td) { int error, reclen; struct uio uio; struct iovec iov; struct dirent *dp; KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); if (*len == 0) { iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = *off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; *eofflag = 0; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, NULL, NULL); if (error) return (error); *off = uio.uio_offset; *cpos = dirbuf; *len = (dirbuflen - uio.uio_resid); if (*len == 0) return (ENOENT); } dp = (struct dirent *)(*cpos); reclen = dp->d_reclen; *dpp = dp; /* check for malformed directory. */ if (reclen < DIRENT_MINSIZE) return (EINVAL); *cpos += reclen; *len -= reclen; return (0); } /* * Check if a named file exists in a given directory vnode.
*/ static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td) { char *dirbuf, *cpos; int error, eofflag, dirbuflen, len, found; off_t off; struct dirent *dp; struct vattr va; KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); found = 0; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error) return (found); dirbuflen = DEV_BSIZE; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); off = 0; len = 0; do { error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off, &cpos, &len, &eofflag, td); if (error) goto out; if (dp->d_type != DT_WHT && dp->d_fileno != 0 && strcmp(dp->d_name, dirname) == 0) { found = 1; goto out; } } while (len > 0 || !eofflag); out: free(dirbuf, M_TEMP); return (found); } int vop_stdaccess(struct vop_access_args *ap) { KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td)); } int vop_stdaccessx(struct vop_accessx_args *ap) { int error; accmode_t accmode = ap->a_accmode; error = vfs_unixify_accmode(&accmode); if (error != 0) return (error); if (accmode == 0) return (0); return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td)); } /* * Advisory record locking support */ int vop_stdadvlock(struct vop_advlock_args *ap) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; vp = ap->a_vp; /* * Provide atomicity of open(O_CREAT | O_EXCL | O_EXLOCK) for * local filesystems. See vn_open_cred() for reciprocal part. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_LOCAL) != 0 && ap->a_op == F_SETLK && (ap->a_flags & F_FIRSTOPEN) == 0) { VI_LOCK(vp); while ((vp->v_iflag & VI_FOPENING) != 0) msleep(vp, VI_MTX(vp), PLOCK, "lockfo", 0); VI_UNLOCK(vp); } if (ap->a_fl->l_whence == SEEK_END) { /* * The NFSv4 server must avoid doing a vn_lock() here, since it * can deadlock the nfsd threads, due to a LOR. Fortunately * the NFSv4 server always uses SEEK_SET and this code is * only required for the SEEK_END case. */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); VOP_UNLOCK(vp); if (error) return (error); } else vattr.va_size = 0; return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size)); } int vop_stdadvlockasync(struct vop_advlockasync_args *ap) { struct vnode *vp; struct vattr vattr; int error; vp = ap->a_vp; if (ap->a_fl->l_whence == SEEK_END) { /* The size argument is only needed for SEEK_END. */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); VOP_UNLOCK(vp); if (error) return (error); } else vattr.va_size = 0; return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size)); } int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap) { struct vnode *vp; vp = ap->a_vp; lf_purgelocks(vp, &vp->v_lockf); return (0); } /* * vop_stdpathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. 
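 * The usual pattern is for a filesystem's pathconf to handle only the
 * names it has specific limits for and to delegate everything else, as
 * tmpfs_pathconf() above does via "default: error = vop_stdpathconf(v);".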
*/ int vop_stdpathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ASYNC_IO: *ap->a_retval = _POSIX_ASYNCHRONOUS_IO; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_CAP_PRESENT: case _PC_DEALLOC_PRESENT: case _PC_INF_PRESENT: case _PC_MAC_PRESENT: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { struct vnode *vp = ap->a_vp; struct mtx *ilk; ilk = VI_MTX(vp); return (lockmgr_lock_flags(vp->v_vnlock, ap->a_flags, &ilk->lock_object, ap->a_file, ap->a_line)); } /* See above. */ int vop_stdunlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr_unlock(vp->v_vnlock)); } /* See above. */ int vop_stdislocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { return (lockstatus(ap->a_vp->v_vnlock)); } /* * Variants of the above set. * * Differences are: * - shared locking disablement is not supported * - v_vnlock pointer is not honored */ int vop_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; struct mtx *ilk; MPASS(vp->v_vnlock == &vp->v_lock); if (__predict_false((flags & ~(LK_TYPE_MASK | LK_NODDLKTREAT | LK_RETRY)) != 0)) goto other; switch (flags & LK_TYPE_MASK) { case LK_SHARED: return (lockmgr_slock(&vp->v_lock, flags, ap->a_file, ap->a_line)); case LK_EXCLUSIVE: return (lockmgr_xlock(&vp->v_lock, flags, ap->a_file, ap->a_line)); } other: ilk = VI_MTX(vp); return (lockmgr_lock_flags(&vp->v_lock, flags, &ilk->lock_object, ap->a_file, ap->a_line)); } int vop_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; MPASS(vp->v_vnlock == &vp->v_lock); return (lockmgr_unlock(&vp->v_lock)); } int vop_islocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; MPASS(vp->v_vnlock == &vp->v_lock); return (lockstatus(&vp->v_lock)); } /* * Return true for select/poll. */ int vop_nopoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (POLLNVAL); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(ap) struct vop_getwritemount_args /* { struct vnode *a_vp; struct mount **a_mpp; } */ *ap; { struct mount *mp; struct vnode *vp; /* * Note that having a reference does not prevent forced unmount from * setting ->v_mount to NULL after the lock gets released. This is of * no consequence for typical consumers (most notably vn_start_write) * since in this case the vnode is VIRF_DOOMED. Unmount might have * progressed far enough that its completion is only delayed by the * reference obtained here. 
The consumer only needs to concern itself * with releasing it. */ vp = ap->a_vp; mp = vfs_ref_from_vp(vp); *(ap->a_mpp) = mp; return (0); } /* * If the file system doesn't implement VOP_BMAP, then return sensible defaults: * - Return the vnode's bufobj instead of any underlying device's bufobj * - Calculate the physical block number as if there were equal size * consecutive blocks, but * - Report no contiguous runs of blocks. */ int vop_stdbmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { return (vn_fsync_buf(ap->a_vp, ap->a_waitfor)); } static int vop_stdfdatasync(struct vop_fdatasync_args *ap) { return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); } int vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) { return (vn_fsync_buf(ap->a_vp, MNT_WAIT)); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL); } static int vop_stdgetpages_async(struct vop_getpages_async_args *ap) { int error; error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead); if (ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). 
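 * The default below simply hands the pages to the vnode pager's generic
 * clustered implementation, vnode_pager_generic_putpages().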
*/ int vop_stdputpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } int vop_stdvptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } int vop_stdvptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct ucred *cred; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; char *dirbuf, *cpos; int i, error, eofflag, dirbuflen, flags, locked, len, covered; off_t off; ino_t fileno; struct vattr va; struct nameidata nd; struct thread *td; struct dirent *dp; struct vnode *mvp; i = *buflen; error = 0; covered = 0; td = curthread; cred = td->td_ucred; if (vp->v_type != VDIR) return (ENOENT); error = VOP_GETATTR(vp, &va, cred); if (error) return (error); VREF(vp); locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, "..", vp); flags = FREAD; error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL); if (error) { vn_lock(vp, locked | LK_RETRY); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); mvp = *dvp = nd.ni_vp; if (vp->v_mount != (*dvp)->v_mount && ((*dvp)->v_vflag & VV_ROOT) && ((*dvp)->v_mount->mnt_flag & MNT_UNION)) { *dvp = (*dvp)->v_mount->mnt_vnodecovered; VREF(mvp); VOP_UNLOCK(mvp); vn_close(mvp, FREAD, cred, td); VREF(*dvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); covered = 1; } fileno = va.va_fileid; dirbuflen = DEV_BSIZE; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); if ((*dvp)->v_type != VDIR) { error = ENOENT; goto out; } off = 0; len = 0; do { /* call VOP_READDIR of parent */ error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off, &cpos, &len, &eofflag, td); if (error) goto out; if ((dp->d_type != DT_WHT) && (dp->d_fileno == fileno)) { if (covered) { VOP_UNLOCK(*dvp); vn_lock(mvp, LK_SHARED | LK_RETRY); if (dirent_exists(mvp, dp->d_name, td)) { error = ENOENT; VOP_UNLOCK(mvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); goto out; } VOP_UNLOCK(mvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); } i -= dp->d_namlen; if (i < 0) { error = ENOMEM; goto out; } if (dp->d_namlen == 1 && dp->d_name[0] == '.') { error = ENOENT; } else { bcopy(dp->d_name, buf + i, dp->d_namlen); error = 0; } goto out; } } while (len > 0 || !eofflag); error = ENOENT; out: free(dirbuf, M_TEMP); if (!error) { *buflen = i; vref(*dvp); } if (covered) { vput(*dvp); vrele(mvp); } else { VOP_UNLOCK(mvp); vn_close(mvp, FREAD, cred, td); } vn_lock(vp, locked | LK_RETRY); return (error); } int vop_stdallocate(struct vop_allocate_args *ap) { #ifdef __notyet__ struct statfs *sfs; off_t maxfilesize = 0; #endif struct iovec aiov; struct vattr vattr, *vap; struct uio auio; off_t fsize, len, cur, offset; uint8_t *buf; struct thread *td; struct vnode *vp; size_t iosize; int error; buf = NULL; error = 0; td = curthread; vap = &vattr; vp = ap->a_vp; len = *ap->a_len; offset = *ap->a_offset; error = VOP_GETATTR(vp, vap, ap->a_cred); if (error != 0) goto out; fsize = vap->va_size; iosize = vap->va_blocksize; if (iosize == 0) iosize = BLKDEV_IOSIZE; if (iosize > maxphys) iosize = maxphys; buf = malloc(iosize, M_TEMP, M_WAITOK); #ifdef __notyet__ /* * Check if the filesystem sets f_maxfilesize; if not use * VOP_SETATTR to perform the check. 
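 * The probe grows the file to offset + len with VOP_SETATTR() and then
 * immediately restores the original size; a filesystem that cannot
 * represent the larger size is expected to fail the first setattr.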
*/ sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = VFS_STATFS(vp->v_mount, sfs, td); if (error == 0) maxfilesize = sfs->f_maxfilesize; free(sfs, M_STATFS); if (error != 0) goto out; if (maxfilesize) { if (offset > maxfilesize || len > maxfilesize || offset + len > maxfilesize) { error = EFBIG; goto out; } } else #endif if (offset + len > vap->va_size) { /* * Test offset + len against the filesystem's maxfilesize. */ VATTR_NULL(vap); vap->va_size = offset + len; error = VOP_SETATTR(vp, vap, ap->a_cred); if (error != 0) goto out; VATTR_NULL(vap); vap->va_size = fsize; error = VOP_SETATTR(vp, vap, ap->a_cred); if (error != 0) goto out; } for (;;) { /* * Read and write back anything below the nominal file * size. There's currently no way outside the filesystem * to know whether this area is sparse or not. */ cur = iosize; if ((offset % iosize) != 0) cur -= (offset % iosize); if (cur > len) cur = len; if (offset < fsize) { aiov.iov_base = buf; aiov.iov_len = cur; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = cur; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; error = VOP_READ(vp, &auio, ap->a_ioflag, ap->a_cred); if (error != 0) break; if (auio.uio_resid > 0) { bzero(buf + cur - auio.uio_resid, auio.uio_resid); } } else { bzero(buf, cur); } aiov.iov_base = buf; aiov.iov_len = cur; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = cur; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; error = VOP_WRITE(vp, &auio, ap->a_ioflag, ap->a_cred); if (error != 0) break; len -= cur; offset += cur; if (len == 0) break; if (should_yield()) break; } out: *ap->a_len = len; *ap->a_offset = offset; free(buf, M_TEMP); return (error); } static int vp_zerofill(struct vnode *vp, struct vattr *vap, off_t *offsetp, off_t *lenp, int ioflag, struct ucred *cred) { int iosize; int error = 0; struct iovec aiov; struct uio auio; struct thread *td; off_t offset, len; iosize = vap->va_blocksize; td = curthread; offset = *offsetp; len = *lenp; if (iosize == 0) iosize = BLKDEV_IOSIZE; /* If va_blocksize is 512 bytes, iosize will be 4 kilobytes */ iosize = min(iosize * 8, ZERO_REGION_SIZE); while (len > 0) { int xfersize = iosize; if (offset % iosize != 0) xfersize -= offset % iosize; if (xfersize > len) xfersize = len; aiov.iov_base = __DECONST(void *, zero_region); aiov.iov_len = xfersize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = xfersize; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; error = VOP_WRITE(vp, &auio, ioflag, cred); if (error != 0) { len -= xfersize - auio.uio_resid; offset += xfersize - auio.uio_resid; break; } len -= xfersize; offset += xfersize; } *offsetp = offset; *lenp = len; return (error); } int vop_stddeallocate(struct vop_deallocate_args *ap) { struct vnode *vp; off_t offset, len; struct ucred *cred; int error; struct vattr va; off_t noff, xfersize, rem; vp = ap->a_vp; offset = *ap->a_offset; cred = ap->a_cred; error = VOP_GETATTR(vp, &va, cred); if (error) return (error); len = omin((off_t)va.va_size - offset, *ap->a_len); while (len > 0) { noff = offset; error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred); if (error) { if (error != ENXIO) /* XXX: Is it okay to fallback further? 
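 * ENXIO from FIOSEEKDATA means that no data region follows the
 * offset, i.e. the rest of the range is already a hole; that case
 * is handled below instead of being treated as a failure.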
*/ goto out; /* * No more data region to be filled */ offset += len; len = 0; error = 0; break; } KASSERT(noff >= offset, ("FIOSEEKDATA going backward")); if (noff != offset) { xfersize = omin(noff - offset, len); len -= xfersize; offset += xfersize; if (len == 0) break; } error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred); if (error) goto out; /* Fill zeroes */ xfersize = rem = omin(noff - offset, len); error = vp_zerofill(vp, &va, &offset, &rem, ap->a_ioflag, cred); if (error) { len -= xfersize - rem; goto out; } len -= xfersize; if (should_yield()) break; } /* Handle the case when offset is beyond EOF */ if (len < 0) len = 0; out: *ap->a_offset = offset; *ap->a_len = len; return (error); } int vop_stdadvise(struct vop_advise_args *ap) { struct vnode *vp; struct bufobj *bo; daddr_t startn, endn; off_t bstart, bend, start, end; int bsize, error; vp = ap->a_vp; switch (ap->a_advice) { case POSIX_FADV_WILLNEED: /* * Do nothing for now. Filesystems should provide a * custom method which starts an asynchronous read of * the requested region. */ error = 0; break; case POSIX_FADV_DONTNEED: error = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); break; } /* * Round to block boundaries (and later possibly further to * page boundaries). Applications cannot reasonably be aware * of the boundaries, and the rounding must be to expand at * both extremities to cover enough. It still doesn't cover * read-ahead. For partial blocks, this gives unnecessary * discarding of buffers but is efficient enough since the * pages usually remain in VMIO for some time. */ bsize = vp->v_bufobj.bo_bsize; bstart = rounddown(ap->a_start, bsize); bend = roundup(ap->a_end, bsize); /* * Deactivate pages in the specified range from the backing VM * object. Pages that are resident in the buffer cache will * remain wired until their corresponding buffers are released * below. */ if (vp->v_object != NULL) { start = trunc_page(bstart); end = round_page(bend); VM_OBJECT_RLOCK(vp->v_object); vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start), OFF_TO_IDX(end)); VM_OBJECT_RUNLOCK(vp->v_object); } bo = &vp->v_bufobj; BO_RLOCK(bo); startn = bstart / bsize; endn = bend / bsize; error = bnoreuselist(&bo->bo_clean, bo, startn, endn); if (error == 0) error = bnoreuselist(&bo->bo_dirty, bo, startn, endn); BO_RUNLOCK(bo); VOP_UNLOCK(vp); break; default: error = EINVAL; break; } return (error); } int vop_stdunp_bind(struct vop_unp_bind_args *ap) { ap->a_vp->v_unpcb = ap->a_unpcb; return (0); } int vop_stdunp_connect(struct vop_unp_connect_args *ap) { *ap->a_unpcb = ap->a_vp->v_unpcb; return (0); } int vop_stdunp_detach(struct vop_unp_detach_args *ap) { ap->a_vp->v_unpcb = NULL; return (0); } static int vop_stdis_text(struct vop_is_text_args *ap) { return (ap->a_vp->v_writecount < 0); } int vop_stdset_text(struct vop_set_text_args *ap) { struct vnode *vp; struct mount *mp; int error, n; vp = ap->a_vp; /* * Avoid the interlock if execs are already present. */ n = atomic_load_int(&vp->v_writecount); for (;;) { if (n > -1) { break; } if (atomic_fcmpset_int(&vp->v_writecount, &n, n - 1)) { return (0); } } VI_LOCK(vp); if (vp->v_writecount > 0) { error = ETXTBSY; } else { /* * If requested by fs, keep a use reference to the * vnode until the last text reference is released. 
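 * Text references are accounted for as negative v_writecount values;
 * VI_TEXT_REF records that this vnode holds the extra use reference,
 * so that vop_stdunset_text() knows to drop it with vunref().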
*/ mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 && vp->v_writecount == 0) { VNPASS((vp->v_iflag & VI_TEXT_REF) == 0, vp); vp->v_iflag |= VI_TEXT_REF; vrefl(vp); } atomic_subtract_int(&vp->v_writecount, 1); error = 0; } VI_UNLOCK(vp); return (error); } static int vop_stdunset_text(struct vop_unset_text_args *ap) { struct vnode *vp; int error, n; bool last; vp = ap->a_vp; /* * Avoid the interlock if this is not the last exec. */ n = atomic_load_int(&vp->v_writecount); for (;;) { if (n >= -1) { break; } if (atomic_fcmpset_int(&vp->v_writecount, &n, n + 1)) { return (0); } } last = false; VI_LOCK(vp); if (vp->v_writecount < 0) { if ((vp->v_iflag & VI_TEXT_REF) != 0 && vp->v_writecount == -1) { last = true; vp->v_iflag &= ~VI_TEXT_REF; } atomic_add_int(&vp->v_writecount, 1); error = 0; } else { error = EINVAL; } VI_UNLOCK(vp); if (last) vunref(vp); return (error); } -static int -vop_stdadd_writecount(struct vop_add_writecount_args *ap) +static int __always_inline +vop_stdadd_writecount_impl(struct vop_add_writecount_args *ap, bool handle_msync) { struct vnode *vp; - struct mount *mp; + struct mount *mp __diagused; int error; vp = ap->a_vp; + +#ifdef INVARIANTS + mp = vp->v_mount; + if (mp != NULL) { + if (handle_msync) { + VNPASS((mp->mnt_kern_flag & MNTK_NOMSYNC) == 0, vp); + } else { + VNPASS((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0, vp); + } + } +#endif + VI_LOCK_FLAGS(vp, MTX_DUPOK); - if (vp->v_writecount < 0) { + if (__predict_false(vp->v_writecount < 0)) { error = ETXTBSY; } else { VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, ("neg writecount increment %d", ap->a_inc)); - if (vp->v_writecount == 0) { - mp = vp->v_mount; - if (mp != NULL && (mp->mnt_kern_flag & MNTK_NOMSYNC) == 0) - vlazy(vp); + if (handle_msync && vp->v_writecount == 0) { + vlazy(vp); } vp->v_writecount += ap->a_inc; error = 0; } VI_UNLOCK(vp); return (error); } +int +vop_stdadd_writecount(struct vop_add_writecount_args *ap) +{ + + return (vop_stdadd_writecount_impl(ap, true)); +} + +int +vop_stdadd_writecount_nomsync(struct vop_add_writecount_args *ap) +{ + + return (vop_stdadd_writecount_impl(ap, false)); +} + int vop_stdneed_inactive(struct vop_need_inactive_args *ap) { return (1); } int vop_stdioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct vattr va; off_t *offp; int error; switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: vp = ap->a_vp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (EBADF); if (vp->v_type == VREG) error = VOP_GETATTR(vp, &va, ap->a_cred); else error = ENOTTY; if (error == 0) { offp = ap->a_data; if (*offp < 0 || *offp >= va.va_size) error = ENXIO; else if (ap->a_command == FIOSEEKHOLE) *offp = va.va_size; } VOP_UNLOCK(vp); break; default: error = ENOTTY; break; } return (error); } /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. */ int vfs_stdroot (mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp) struct mount *mp; struct statfs *sbp; { return (EOPNOTSUPP); } int vfs_stdquotactl (mp, cmds, uid, arg, mp_busy) struct mount *mp; int cmds; uid_t uid; void *arg; bool *mp_busy; { return (EOPNOTSUPP); } int vfs_stdsync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *vp, *mvp; struct thread *td; int error, lockreq, allerror = 0; td = curthread; lockreq = LK_EXCLUSIVE | LK_INTERLOCK; if (waitfor != MNT_WAIT) lockreq |= LK_NOWAIT; /* * Force stale buffer cache information to be flushed. 
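 * If vget() fails with ENOENT below, the vnode was recycled while we
 * slept and the iteration over the mount's vnode list must restart.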
*/ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; vput(vp); } return (allerror); } int vfs_stdnosync (mp, waitfor) struct mount *mp; int waitfor; { return (0); } static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap) { int error; error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred, ap->a_outcred, ap->a_fsizetd); return (error); } int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; { if (filename_vp != NULL) VOP_UNLOCK(filename_vp); return (EOPNOTSUPP); } int vfs_stdsysctl(mp, op, req) struct mount *mp; fsctlop_t op; struct sysctl_req *req; { return (EOPNOTSUPP); } static vop_bypass_t * bp_by_off(struct vop_vector *vop, struct vop_generic_args *a) { return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset)); } int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a) { vop_bypass_t *bp; int prev_stops, rc; bp = bp_by_off(vop, a); MPASS(bp != NULL); prev_stops = sigdeferstop(SIGDEFERSTOP_SILENT); rc = bp(a); sigallowstop(prev_stops); return (rc); } static int vop_stdstat(struct vop_stat_args *a) { struct vattr vattr; struct vattr *vap; struct vnode *vp; struct stat *sb; int error; u_short mode; vp = a->a_vp; sb = a->a_sb; error = vop_stat_helper_pre(a); if (error != 0) return (error); vap = &vattr; /* * Initialize defaults for new and unusual fields, so that file * systems which don't support these fields don't need to know * about them. 
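 * The sentinel values set here (e.g. va_fsid = VNOVAL) are tested
 * after VOP_GETATTR() so that fallbacks such as the mount's f_fsid
 * for st_dev can be substituted.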
*/ vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; vap->va_fsid = VNOVAL; vap->va_gen = 0; vap->va_rdev = NODEV; error = VOP_GETATTR(vp, vap, a->a_active_cred); if (error) goto out; /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: error = EBADF; goto out; } sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) { error = EOVERFLOW; goto out; } sb->st_size = vap->va_size; sb->st_atim.tv_sec = vap->va_atime.tv_sec; sb->st_atim.tv_nsec = vap->va_atime.tv_nsec; sb->st_mtim.tv_sec = vap->va_mtime.tv_sec; sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec; sb->st_ctim.tv_sec = vap->va_ctime.tv_sec; sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec; sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec; sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Use minimum/default of PAGE_SIZE (e.g. for VCHR). */ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); sb->st_flags = vap->va_flags; sb->st_blocks = vap->va_bytes / S_BLKSIZE; sb->st_gen = vap->va_gen; out: return (vop_stat_helper_post(a, error)); } static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap __unused) { return (EJUSTRETURN); } static int vop_stdvput_pair(struct vop_vput_pair_args *ap) { struct vnode *dvp, *vp, **vpp; dvp = ap->a_dvp; vpp = ap->a_vpp; vput(dvp); if (vpp != NULL && ap->a_unlock_vp && (vp = *vpp) != NULL) vput(vp); return (0); } diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 3d04b6f68784..c84d015b011c 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -1,1156 +1,1157 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct cache_fpl; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ seqc_t v_seqc; /* i modification count */ uint32_t v_nchash; /* u namecache hash */ u_int v_hash; struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. */ union { struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ struct cdev *v_rdev; /* v device (VCHR, VBLK) */ struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. 
vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_vnodelist; /* l vnode lists */ TAILQ_ENTRY(vnode) v_lazylist; /* l vnode lazy list */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_short v_iflag; /* i vnode flags (see below) */ u_short v_vflag; /* v vnode flags */ u_short v_mflag; /* l mnt-specific vnode flags */ short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ int v_seqc_users; /* i modifications pending */ }; #ifndef DEBUG_LOCKS #ifdef _LP64 /* * Not crossing 448 bytes fits 9 vnodes per page. If you have to add fields * to the structure and there is nothing which can be done to prevent growth * then so be it. But don't grow it without a good reason. */ _Static_assert(sizeof(struct vnode) <= 448, "vnode size crosses 448 bytes"); #endif #endif #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. 
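 * Most code tests the doomed state through the VN_IS_DOOMED() macro
 * defined later in this header.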
*/ #define VHOLD_NO_SMR (1<<29) /* Disable vhold_smr */ #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR) #define VIRF_DOOMED 0x0001 /* This vnode is being recycled */ #define VIRF_PGREAD 0x0002 /* Direct reads from the page cache are permitted, never cleared once set */ #define VIRF_MOUNTPOINT 0x0004 /* This vnode is mounted on */ #define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ #define VI_MOUNT 0x0002 /* Mount in progress */ #define VI_DOINGINACT 0x0004 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x0008 /* Need to call inactive */ #define VI_DEFINACT 0x0010 /* deferred inactive */ #define VI_FOPENING 0x0020 /* In open, with opening process having the first right to advlock file */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_VMSIZEVNLOCK 0x0020 /* object size check requires vnode lock */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VV_UNREF 0x4000 /* vunref, do not drop lock in inactive() */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ u_short va_padding0; uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ nlink_t va_nlink; /* number of references to file */ dev_t va_fsid; /* filesystem id */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. 
(high 16 bits used to ask for read-ahead and
 * help with write clustering)
 * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
 */
#define	IO_UNIT		0x0001		/* do I/O as atomic unit */
#define	IO_APPEND	0x0002		/* append write to end */
#define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
#define	IO_NODELOCKED	0x0008		/* underlying node already locked */
#define	IO_ASYNC	0x0010		/* bawrite rather than bdwrite */
#define	IO_VMIO		0x0020		/* data already in VMIO space */
#define	IO_INVAL	0x0040		/* invalidate after I/O */
#define	IO_SYNC		0x0080		/* do I/O synchronously */
#define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
#define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
#define	IO_EXT		0x0400		/* operate on external attributes */
#define	IO_NORMAL	0x0800		/* operate on regular data */
#define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
#define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
#define	IO_RANGELOCKED	0x4000		/* range locked */
#define	IO_DATASYNC	0x8000		/* do only data I/O synchronously */

#define	IO_SEQMAX	0x7F		/* seq heuristic max value */
#define	IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */

/*
 * Flags for accmode_t.
 */
#define	VEXEC			000000000100 /* execute/search permission */
#define	VWRITE			000000000200 /* write permission */
#define	VREAD			000000000400 /* read permission */
#define	VADMIN			000000010000 /* being the file owner */
#define	VAPPEND			000000040000 /* permission to write/append */
/*
 * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
 * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
 * and 0 otherwise.  This never happens with ordinary unix access rights
 * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
 * some other V* constant.
 */
#define	VEXPLICIT_DENY		000000100000
#define	VREAD_NAMED_ATTRS	000000200000 /* not used */
#define	VWRITE_NAMED_ATTRS	000000400000 /* not used */
#define	VDELETE_CHILD		000001000000
#define	VREAD_ATTRIBUTES	000002000000 /* permission to stat(2) */
#define	VWRITE_ATTRIBUTES	000004000000 /* change {m,c,a}time */
#define	VDELETE			000010000000
#define	VREAD_ACL		000020000000 /* read ACL and file mode */
#define	VWRITE_ACL		000040000000 /* change ACL and/or file mode */
#define	VWRITE_OWNER		000100000000 /* change file owner */
#define	VSYNCHRONIZE		000200000000 /* not used */
#define	VCREAT			000400000000 /* creating new file */
#define	VVERIFY			001000000000 /* verification required */

/*
 * Permissions that were traditionally granted only to the file owner.
 */
#define	VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
    VWRITE_OWNER)

/*
 * Permissions that were traditionally granted to everyone.
 */
#define	VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)

/*
 * Permissions that allow changing the state of the file in any way.
 */
#define	VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
    VDELETE)

/*
 * Token indicating no attribute value yet assigned.
 */
#define	VNOVAL	(-1)

/*
 * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
 */
#define	VLKTIMEOUT	(hz / 20 + 1)

#ifdef _KERNEL

#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_VNODE);
#endif

extern u_int ncsizefactor;
extern const u_int io_hold_cnt;

/*
 * Convert between vnode types and inode formats (since POSIX.1
 * defines mode word of stat structure in terms of inode formats).
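 * For example, IFTOVT(S_IFDIR) yields VDIR and VTTOIF(VREG) yields
 * S_IFREG.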
*/ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ #define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern u_long desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ extern u_int vn_lock_pair_pause_max; #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. 
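 * One instance is generated for every vnode operation; bypass routines
 * use the recorded offsets to locate the vnode, cred and componentname
 * arguments within the operation's argument structure.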
*/ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ int vdesc_vop_offset; vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #define ASSERT_VOP_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_IN_SEQC(vp) ((void)0) #define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. 
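 * (vnode_if.h is generated at build time from sys/kern/vnode_if.src.)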
*/ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 #define VN_OPEN_INVFS 0x00000008 /* copy_file_range kernel flags */ #define COPY_FILE_RANGE_KFLAGS 0xff000000 #define COPY_FILE_RANGE_TIMEO1SEC 0x01000000 /* Return after 1sec. */ /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct freebsd11_stat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(u_long newhashsize); #define VFS_CACHE_DROPOLD 0x1 void cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp, int flags); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_vnode_init(struct vnode *vp); void cache_purge(struct vnode *vp); void cache_purge_vgone(struct vnode *vp); void cache_purge_negative(struct vnode *vp); void cache_purgevfs(struct mount *mp); char *cache_symlink_alloc(size_t size, int flags); void cache_symlink_free(char *string, size_t size); int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len); void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp); void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp); #ifdef INVARIANTS void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); #else static inline void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { } #endif void cache_fast_lookup_enabled_recalc(void); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); int freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(void); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen); int vn_getcwd(char *buf, char **retbuf, size_t *buflen); int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf); int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf); int vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, const char *hdrl_name, size_t hrdl_name_length, char **retbuf, char **freebuf, size_t *buflen); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t 
file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred); int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred); void vattr_null(struct vattr *vap); void vlazy(struct vnode *); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int flags); enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vget_finish_ref(struct vnode *vp, enum vgetstate vs); void vget_abort(struct vnode *vp, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdnz(struct vnode *); bool vhold_smr(struct vnode *); int vinactive(struct vnode *vp); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflg, struct ucred *active_cred, struct ucred *file_cred); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_need_pageq_flush(struct vnode *vp); bool vn_isdisk_error(struct vnode *vp, int *errp); bool vn_isdisk(struct vnode *vp); int _vn_lock(struct vnode *vp, int flags, const char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) void vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, bool vp2_locked); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); void vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, 
struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_read_from_obj(struct vnode *vp, struct uio *uio); int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); void vn_seqc_write_begin_locked(struct vnode *vp); void vn_seqc_write_begin(struct vnode *vp); void vn_seqc_write_end_locked(struct vnode *vp); void vn_seqc_write_end(struct vnode *vp); #define vn_seqc_read_any(vp) seqc_read_any(&(vp)->v_seqc) #define vn_seqc_read_notmodify(vp) seqc_read_notmodify(&(vp)->v_seqc) #define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq) #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_tryrlock(vp, start, end) \ rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_trywlock(vp, start, end) \ rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_irflag_read(vp) atomic_load_short(&(vp)->v_irflag) void vn_irflag_set_locked(struct vnode *vp, short toset); void vn_irflag_set(struct vnode *vp, short toset); void vn_irflag_set_cond_locked(struct vnode *vp, short toset); void vn_irflag_set_cond(struct vnode *vp, short toset); void vn_irflag_unset_locked(struct vnode *vp, short tounset); void vn_irflag_unset(struct vnode *vp, short tounset); int vfs_cache_lookup(struct vop_lookup_args *ap); int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); struct vnode *vnlru_alloc_marker(void); void vnlru_free_marker(struct vnode *); void vnlru_free_vfsops(int, struct vfsops *, struct vnode *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int 
vop_stdinactive(struct vop_inactive_args *); int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_lock(struct vop_lock1_args *); int vop_unlock(struct vop_unlock_args *); int vop_islocked(struct vop_islocked_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stddeallocate(struct vop_deallocate_args *ap); int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); +int vop_stdadd_writecount_nomsync(struct vop_add_writecount_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_eagain(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. 
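 * The generated VOP_*() wrappers invoke these pre/post hooks; they are
 * mainly used to post knote/kqueue events (e.g. NOTE_WRITE) and, for
 * operations like rename and rmdir, to keep the name cache consistent.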
/* These are called from within the actual VOPS. */
void	vop_close_post(void *a, int rc);
void	vop_create_pre(void *a);
void	vop_create_post(void *a, int rc);
void	vop_whiteout_pre(void *a);
void	vop_whiteout_post(void *a, int rc);
void	vop_deleteextattr_pre(void *a);
void	vop_deleteextattr_post(void *a, int rc);
void	vop_link_pre(void *a);
void	vop_link_post(void *a, int rc);
void	vop_lookup_post(void *a, int rc);
void	vop_lookup_pre(void *a);
void	vop_mkdir_pre(void *a);
void	vop_mkdir_post(void *a, int rc);
void	vop_mknod_pre(void *a);
void	vop_mknod_post(void *a, int rc);
void	vop_open_post(void *a, int rc);
void	vop_read_post(void *a, int rc);
void	vop_read_pgcache_post(void *ap, int rc);
void	vop_readdir_post(void *a, int rc);
void	vop_reclaim_post(void *a, int rc);
void	vop_remove_pre(void *a);
void	vop_remove_post(void *a, int rc);
void	vop_rename_post(void *a, int rc);
void	vop_rename_pre(void *a);
void	vop_rmdir_pre(void *a);
void	vop_rmdir_post(void *a, int rc);
void	vop_setattr_pre(void *a);
void	vop_setattr_post(void *a, int rc);
void	vop_setacl_pre(void *a);
void	vop_setacl_post(void *a, int rc);
void	vop_setextattr_pre(void *a);
void	vop_setextattr_post(void *a, int rc);
void	vop_symlink_pre(void *a);
void	vop_symlink_post(void *a, int rc);
int	vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);

#ifdef DEBUG_VFS_LOCKS
void	vop_fdatasync_debugpre(void *a);
void	vop_fdatasync_debugpost(void *a, int rc);
void	vop_fplookup_vexec_debugpre(void *a);
void	vop_fplookup_vexec_debugpost(void *a, int rc);
void	vop_fplookup_symlink_debugpre(void *a);
void	vop_fplookup_symlink_debugpost(void *a, int rc);
void	vop_fsync_debugpre(void *a);
void	vop_fsync_debugpost(void *a, int rc);
void	vop_strategy_debugpre(void *a);
void	vop_lock_debugpre(void *a);
void	vop_lock_debugpost(void *a, int rc);
void	vop_unlock_debugpre(void *a);
void	vop_need_inactive_debugpre(void *a);
void	vop_need_inactive_debugpost(void *a, int rc);
void	vop_mkdir_debugpost(void *a, int rc);
#else
#define	vop_fdatasync_debugpre(x)		do { } while (0)
#define	vop_fdatasync_debugpost(x, y)		do { } while (0)
#define	vop_fplookup_vexec_debugpre(x)		do { } while (0)
#define	vop_fplookup_vexec_debugpost(x, y)	do { } while (0)
#define	vop_fplookup_symlink_debugpre(x)	do { } while (0)
#define	vop_fplookup_symlink_debugpost(x, y)	do { } while (0)
#define	vop_fsync_debugpre(x)			do { } while (0)
#define	vop_fsync_debugpost(x, y)		do { } while (0)
#define	vop_strategy_debugpre(x)		do { } while (0)
#define	vop_lock_debugpre(x)			do { } while (0)
#define	vop_lock_debugpost(x, y)		do { } while (0)
#define	vop_unlock_debugpre(x)			do { } while (0)
#define	vop_need_inactive_debugpre(x)		do { } while (0)
#define	vop_need_inactive_debugpost(x, y)	do { } while (0)
#define	vop_mkdir_debugpost(x, y)		do { } while (0)
#endif

void	vop_rename_fail(struct vop_rename_args *ap);

#define	vop_stat_helper_pre(ap)	({					\
	int _error;							\
	AUDIT_ARG_VNODE1(ap->a_vp);					\
	_error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\
	if (__predict_true(_error == 0))				\
		bzero(ap->a_sb, sizeof(*ap->a_sb));			\
	_error;								\
})

#define	vop_stat_helper_post(ap, error)	({				\
	int _error = (error);						\
	if (priv_check_cred_vfs_generation(ap->a_active_cred))		\
		ap->a_sb->st_gen = 0;					\
	_error;								\
})

#define	VOP_WRITE_PRE(ap)						\
	struct vattr va;						\
	int error;							\
	off_t osize, ooffset, noffset;					\
									\
	osize = ooffset = noffset = 0;					\
	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
		if (error)						\
			return (error);					\
		ooffset = (ap)->a_uio->uio_offset;			\
		osize = (off_t)va.va_size;				\
	}

#define	VOP_WRITE_POST(ap, ret)						\
	noffset = (ap)->a_uio->uio_offset;				\
	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
		    | (noffset > osize ? NOTE_EXTEND : 0));		\
	}
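/*
 * Illustrative sketch (not part of this header): VOP_WRITE_PRE declares
 * the locals it uses and may return early, so it belongs at the top of a
 * write VOP; VOP_WRITE_POST then posts NOTE_WRITE (and NOTE_EXTEND when
 * the file grew) to kqueue watchers.  The "myfs" name is hypothetical.
 *
 * static int
 * myfs_write(struct vop_write_args *ap)
 * {
 *	int ret;
 *
 *	VOP_WRITE_PRE(ap);
 *	... perform the actual write, setting ret ...
 *	VOP_WRITE_POST(ap, ret);
 *	return (ret);
 * }
 */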
#define	VOP_LOCK(vp, flags)	VOP_LOCK1(vp, flags, __FILE__, __LINE__)

#ifdef INVARIANTS
#define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
do {									\
	int error_;							\
									\
	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
	VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d",	\
	    error_));							\
} while (0)
#define	VOP_SET_TEXT_CHECKED(vp)					\
do {									\
	int error_;							\
									\
	error_ = VOP_SET_TEXT((vp));					\
	VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d",	\
	    error_));							\
} while (0)
#define	VOP_UNSET_TEXT_CHECKED(vp)					\
do {									\
	int error_;							\
									\
	error_ = VOP_UNSET_TEXT((vp));					\
	VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d",	\
	    error_));							\
} while (0)
#else
#define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
#define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
#define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
#endif

#define	VN_IS_DOOMED(vp)	\
	__predict_false((vn_irflag_read(vp) & VIRF_DOOMED) != 0)

void	vput(struct vnode *vp);
void	vrele(struct vnode *vp);
void	vref(struct vnode *vp);
void	vrefact(struct vnode *vp);
void	v_addpollinfo(struct vnode *vp);

static __inline int
vrefcnt(struct vnode *vp)
{
	return (vp->v_usecount);
}

#define	vholdl(vp)	do {						\
	ASSERT_VI_LOCKED(vp, __func__);					\
	vhold(vp);							\
} while (0)

#define	vrefl(vp)	do {						\
	ASSERT_VI_LOCKED(vp, __func__);					\
	vref(vp);							\
} while (0)

int	vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
void	vnode_destroy_vobject(struct vnode *vp);

extern struct vop_vector fifo_specops;
extern struct vop_vector dead_vnodeops;
extern struct vop_vector default_vnodeops;

#define VOP_PANIC	((void*)(uintptr_t)vop_panic)
#define VOP_NULL	((void*)(uintptr_t)vop_null)
#define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
#define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
#define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
#define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
#define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
#define VOP_EAGAIN	((void*)(uintptr_t)vop_eagain)

/* fifo_vnops.c */
int	fifo_printinfo(struct vnode *);

/* vfs_hash.c */
typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);

void	vfs_hash_changesize(u_long newhashsize);
int	vfs_hash_get(const struct mount *mp, u_int hash, int flags,
	    struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn,
	    void *arg);
u_int	vfs_hash_index(struct vnode *vp);
int	vfs_hash_insert(struct vnode *vp, u_int hash, int flags,
	    struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn,
	    void *arg);
void	vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
	    struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
void	vfs_hash_rehash(struct vnode *vp, u_int hash);
void	vfs_hash_remove(struct vnode *vp);
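/*
 * Illustrative sketch (not part of this header): a filesystem's vget-style
 * routine typically consults the hash before allocating a vnode, and lets
 * vfs_hash_insert() resolve allocation races by handing back a vnode that
 * another thread inserted first.  The "myfs_hash_cmp" callback and the
 * variable names are hypothetical.
 *
 *	error = vfs_hash_get(mp, ino, LK_EXCLUSIVE, curthread, &vp,
 *	    myfs_hash_cmp, &ino);
 *	if (error != 0 || vp != NULL)
 *		return (error);
 *	... allocate and initialize the new vnode ...
 *	error = vfs_hash_insert(vp, ino, LK_EXCLUSIVE, curthread, &vp2,
 *	    myfs_hash_cmp, &ino);
 */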
int	vfs_kqfilter(struct vop_kqfilter_args *);
struct dirent;
int	vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp,
	    off_t off);
int	vfs_emptydir(struct vnode *vp);

int	vfs_unixify_accmode(accmode_t *accmode);

void	vfs_unp_reclaim(struct vnode *vp);

int	setfmode(struct thread *td, struct ucred *cred, struct vnode *vp,
	    int mode);
int	setfown(struct thread *td, struct ucred *cred, struct vnode *vp,
	    uid_t uid, gid_t gid);
int	vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
	    struct thread *td);
int	vn_chown(struct file *fp, uid_t uid, gid_t gid,
	    struct ucred *active_cred, struct thread *td);

void	vn_fsid(struct vnode *vp, struct vattr *va);

int	vn_dir_check_exec(struct vnode *vp, struct componentname *cnp);
int	vn_lktype_write(struct mount *mp, struct vnode *vp);

#define	VOP_UNLOCK_FLAGS(vp, flags)	({				\
	struct vnode *_vp = (vp);					\
	int _flags = (flags);						\
	int _error;							\
									\
	if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0)		\
		panic("%s: unsupported flags %x\n", __func__, flags);	\
	_error = VOP_UNLOCK(_vp);					\
	if (_flags & LK_INTERLOCK)					\
		VI_UNLOCK(_vp);						\
	_error;								\
})

#include

#define	VFS_VOP_VECTOR_REGISTER(vnodeops)				\
	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY,	\
	    vfs_vector_op_register, &vnodeops)

#define	VFS_SMR_DECLARE				\
	extern smr_t vfs_smr

#define	VFS_SMR()	vfs_smr
#define	vfs_smr_enter()	smr_enter(VFS_SMR())
#define	vfs_smr_exit()	smr_exit(VFS_SMR())
#define	vfs_smr_synchronize()	smr_synchronize(VFS_SMR())
#define	vfs_smr_entered_load(ptr)	smr_entered_load((ptr), VFS_SMR())
#define	VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
#define	VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
#define	VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())

#define	vn_load_v_data_smr(vp)	({					\
	struct vnode *_vp = (vp);					\
									\
	VFS_SMR_ASSERT_ENTERED();					\
	atomic_load_consume_ptr(&(_vp)->v_data);			\
})

#endif /* _KERNEL */

#endif /* !_SYS_VNODE_H_ */
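/*
 * Illustrative sketch (not part of this header): the vfs_smr accessors
 * above allow lockless reads of v_data in fast-path code; the loaded
 * pointer stays valid only while inside the SMR section.  "vp" is
 * hypothetical.
 *
 *	void *data;
 *
 *	vfs_smr_enter();
 *	data = vn_load_v_data_smr(vp);
 *	... inspect data without taking vnode locks ...
 *	vfs_smr_exit();
 */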