Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 306802) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 306803) @@ -1,2518 +1,2518 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (sys_shutdown && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } #ifndef __FreeBSD_kernel__ static int zfs_create_unique_device(dev_t *dev) { major_t new_major; do { ASSERT3U(zfs_minor, <=, MAXMIN32); minor_t start = zfs_minor; do { mutex_enter(&zfs_dev_mtx); if (zfs_minor >= MAXMIN32) { /* * If we're still using the real major * keep out of /dev/zfs and /dev/zvol minor * number space. If we're using a getudev()'ed * major number, we can use all of its minors. */ if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) zfs_minor = ZFS_MIN_MINOR; else zfs_minor = 0; } else { zfs_minor++; } *dev = makedevice(zfs_major, zfs_minor); mutex_exit(&zfs_dev_mtx); } while (vfs_devismounted(*dev) && zfs_minor != start); if (zfs_minor == start) { /* * We are using all ~262,000 minor numbers for the * current major number. Create a new major number. */ if ((new_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "zfs_mount: Can't get unique major " "device number."); return (-1); } mutex_enter(&zfs_dev_mtx); zfs_major = new_major; zfs_minor = 0; mutex_exit(&zfs_dev_mtx); } else { break; } /* CONSTANTCONDITION */ } while (1); return (0); } #endif /* !__FreeBSD_kernel__ */ static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; #ifdef illumos boolean_t devices = B_FALSE; boolean_t do_devices = B_FALSE; #endif boolean_t xattr = B_FALSE; boolean_t do_xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_config_sync() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); #ifdef illumos error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); #endif error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (SET_ERROR(ENOENT)); /* * If we have a NULL data pointer * then assume the id's aren't changing and * return EEXIST to the dmu to let it know to * use the same ids */ if (data == NULL) return (SET_ERROR(EEXIST)); if (bonustype == DMU_OT_ZNODE) { znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; sa_hdr_phys_t *sap = data; sa_hdr_phys_t sa = *sap; boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled * in yet. */ *userp = 0; *groupp = 0; return (0); } if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; } else { VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); if (swap) { *userp = BSWAP_64(*userp); *groupp = BSWAP_64(*groupp); } } return (0); } static void fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, char *domainbuf, int buflen, uid_t *ridp) { uint64_t fuid; const char *domain; fuid = strtonum(fuidstr, NULL); domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); if (domain) (void) strlcpy(domainbuf, domain, buflen); else domainbuf[0] = '\0'; *ridp = FUID_RID(fuid); } static uint64_t zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) { switch (type) { case ZFS_PROP_USERUSED: return (DMU_USERUSED_OBJECT); case ZFS_PROP_GROUPUSED: return (DMU_GROUPUSED_OBJECT); case ZFS_PROP_USERQUOTA: return (zfsvfs->z_userquota_obj); case ZFS_PROP_GROUPQUOTA: return (zfsvfs->z_groupquota_obj); } return (0); } int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) { int error; zap_cursor_t zc; zap_attribute_t za; zfs_useracct_t *buf = vbuf; uint64_t obj; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) { *bufsizep = 0; return (0); } for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > *bufsizep) break; fuidstr_to_sid(zfsvfs, za.za_name, buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); buf->zu_space = za.za_first_integer; buf++; } if (error == ENOENT) error = 0; ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; *cookiep = zap_cursor_serialize(&zc); zap_cursor_fini(&zc); return (error); } /* * buf must be big enough (eg, 32 bytes) */ static int id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, char *buf, boolean_t addok) { uint64_t fuid; int domainid = 0; if (domain && domain[0]) { domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); if (domainid == -1) return (SET_ERROR(ENOENT)); } fuid = FUID_ENCODE(domainid, rid); (void) sprintf(buf, "%llx", (longlong_t)fuid); return (0); } int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valp) { char buf[32]; int err; uint64_t obj; *valp = 0; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) return (0); err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); if (err) return (err); err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); if (err == ENOENT) err = 0; return (err); } int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota) { char buf[32]; int err; dmu_tx_t *tx; uint64_t *objp; boolean_t fuid_dirtied; if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) return (SET_ERROR(EINVAL)); if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) return (SET_ERROR(ENOTSUP)); objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : &zfsvfs->z_groupquota_obj; err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); if (err) return (err); fuid_dirtied = zfsvfs->z_fuid_dirty; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); if (*objp == 0) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, zfs_userquota_prop_prefixes[type]); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&zfsvfs->z_lock); if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); dmu_tx_commit(tx); return (err); } boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; int err; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); (void) sprintf(buf, "%llx", (longlong_t)fuid); err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); if (err != 0) return (B_FALSE); err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); if (err != 0) return (B_FALSE); return (used >= quota); } boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { uint64_t fuid; uint64_t quotaobj; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; fuid = isgroup ? zp->z_gid : zp->z_uid; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } int zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); /* * We claim to always be readonly so we can open snapshots; * other ZPL code will prevent us from writing to snapshots. */ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); if (error) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); #ifdef DIAGNOSTIC rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); #else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); #endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; else zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } return (0); } extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; /* * This is a barrier to prevent the filesystem from going away in * zfs_znode_move() until we can safely ensure that the filesystem is * not unmounted. We consider the filesystem valid before the barrier * and invalid after the barrier. */ rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; vnode_t *vp; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; #ifdef illumos /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; if (zfs_create_unique_device(&mount_dev) == -1) { error = SET_ERROR(ENODEV); goto out; } ASSERT(vfs_devismounted(mount_dev) == 0); #endif if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | vfsp->mnt_vfc->vfc_typenum & 0xFF; /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef SECLABEL /* * Convert a decimal digit string to a uint64_t integer. */ static int str_to_uint64(char *str, uint64_t *objnum) { uint64_t num = 0; while (*str) { if (*str < '0' || *str > '9') return (SET_ERROR(EINVAL)); num = num*10 + *str++ - '0'; } *objnum = num; return (0); } /* * The boot path passed from the boot loader is in the form of * "rootpool-name/root-filesystem-object-number'. Convert this * string to a dataset name: "rootpool-name/root-filesystem-name". */ static int zfs_parse_bootfs(char *bpath, char *outpath) { char *slashp; uint64_t objnum; int error; if (*bpath == 0 || *bpath == '/') return (SET_ERROR(EINVAL)); (void) strcpy(outpath, bpath); slashp = strchr(bpath, '/'); /* if no '/', just return the pool name */ if (slashp == NULL) { return (0); } /* if not a number, just return the root dataset name */ if (str_to_uint64(slashp+1, &objnum)) { return (0); } *slashp = '\0'; error = dsl_dsobj_to_dsname(bpath, objnum, outpath); *slashp = '/'; return (error); } /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : EACCES); } return (SET_ERROR(EACCES)); } /* * Determine whether the mount is allowed according to MAC check. * by comparing (where appropriate) label of the dataset against * the label of the zone being mounted into. If the dataset has * no label, create one. * * Returns 0 if access allowed, error otherwise (e.g. EACCES) */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) { int error, retv; zone_t *mntzone = NULL; ts_label_t *mnt_tsl; bslabel_t *mnt_sl; bslabel_t ds_sl; char ds_hexsl[MAXNAMELEN]; retv = EACCES; /* assume the worst */ /* * Start by getting the dataset label if it exists. */ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) return (SET_ERROR(EACCES)); /* * If labeling is NOT enabled, then disallow the mount of datasets * which have a non-default label already. No other label checks * are needed. */ if (!is_system_labeled()) { if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); return (SET_ERROR(EACCES)); } /* * Get the label of the mountpoint. If mounting into the global * zone (i.e. mountpoint is not within an active zone and the * zoned property is off), the label must be default or * admin_low/admin_high only; no other checks are needed. */ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); if (mntzone->zone_id == GLOBAL_ZONEID) { uint64_t zoned; zone_rele(mntzone); if (dsl_prop_get_integer(osname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EACCES)); if (!zoned) return (zfs_check_global_label(osname, ds_hexsl)); else /* * This is the case of a zone dataset being mounted * initially, before the zone has been fully created; * allow this mount into global zone. */ return (0); } mnt_tsl = mntzone->zone_slabel; ASSERT(mnt_tsl != NULL); label_hold(mnt_tsl); mnt_sl = label2bslabel(mnt_tsl); if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { /* * The dataset doesn't have a real label, so fabricate one. */ char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && dsl_prop_set_string(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), ZPROP_SRC_LOCAL, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { /* * Now compare labels to complete the MAC check. If the * labels are equal then allow access. If the mountpoint * label dominates the dataset label, allow readonly access. * Otherwise, access is denied. */ if (blequal(mnt_sl, &ds_sl)) retv = 0; else if (bldominates(mnt_sl, &ds_sl)) { vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); retv = 0; } } label_rele(mnt_tsl); zone_rele(mntzone); return (retv); } #endif /* SECLABEL */ #ifdef OPENSOLARIS_MOUNTROOT static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { int error = 0; static int zfsrootdone = 0; zfsvfs_t *zfsvfs = NULL; znode_t *zp = NULL; vnode_t *vp = NULL; char *zfs_bootfs; char *zfs_devid; ASSERT(vfsp); /* * The filesystem that we mount as root is defined in the * boot property "zfs-bootfs" with a format of * "poolname/root-dataset-objnum". */ if (why == ROOT_INIT) { if (zfsrootdone++) return (SET_ERROR(EBUSY)); /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do * something better by looking at the timestamp on * an uberblock, so just set it to -1. */ clkset(-1); if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { cmn_err(CE_NOTE, "spa_get_bootfs: can not get " "bootfs name"); return (SET_ERROR(EINVAL)); } zfs_devid = spa_get_bootprop("diskdevid"); error = spa_import_rootpool(rootfs.bo_name, zfs_devid); if (zfs_devid) spa_free_bootprop(zfs_devid); if (error) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "spa_import_rootpool: error %d", error); return (error); } if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", error); return (error); } spa_free_bootprop(zfs_bootfs); if (error = vfs_lock(vfsp)) return (error); if (error = zfs_domount(vfsp, rootfs.bo_name)) { cmn_err(CE_NOTE, "zfs_domount: error %d", error); goto out; } zfsvfs = (zfsvfs_t *)vfsp->vfs_data; ASSERT(zfsvfs); if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { cmn_err(CE_NOTE, "zfs_zget: error %d", error); goto out; } vp = ZTOV(zp); mutex_enter(&vp->v_lock); vp->v_flag |= VROOT; mutex_exit(&vp->v_lock); rootvp = vp; /* * Leave rootvp held. The root file system is never unmounted. */ vfs_add((struct vnode *)0, vfsp, (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); out: vfs_unlock(vfsp); return (error); } else if (why == ROOT_REMOUNT) { readonly_changed_cb(vfsp->vfs_data, B_FALSE); vfsp->vfs_flag |= VFS_REMOUNT; /* refresh mount options */ zfs_unregister_callbacks(vfsp->vfs_data); return (zfs_register_callbacks(vfsp)); } else if (why == ROOT_UNMOUNT) { zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); (void) zfs_sync(vfsp, 0, 0); return (0); } /* * if "why" is equal to anything else other than ROOT_INIT, * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. */ return (SET_ERROR(ENOTSUP)); } #endif /* OPENSOLARIS_MOUNTROOT */ static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; #ifdef illumos if (mvp->v_type != VDIR) return (SET_ERROR(ENOTDIR)); mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } mutex_exit(&mvp->v_lock); /* * ZFS does not support passing unparsed data in via MS_DATA. * Users should use the MS_OPTIONSTR interface; this means * that all option parsing is already done and the options struct * can be interrogated. */ if ((uap->flags & MS_DATA) && uap->datalen > 0) #else /* !illumos */ if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS)) return (SET_ERROR(EPERM)); if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); #endif /* illumos */ /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK(mvp, 0); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK(mvp, 0); goto out; } VOP_UNLOCK(mvp, 0); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curthread) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } #ifdef SECLABEL error = zfs_mount_label_policy(vfsp, osname); if (error) goto out; #endif vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); #ifdef illumos /* * Add an extra VFS_HOLD on our parent vfs so that it can't * disappear due to a forced unmount. */ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) VFS_HOLD(mvp->v_vfsp); #endif out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); #ifdef FREEBSD_NAMECACHE - cache_purgevfs(zfsvfs->z_parent->z_vfs); + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relavent for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the * parent's filesystem's vfsp. Note, 'z_parent' is self * referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) return (EBUSY); ASSERT(zfsvfs->z_ctldir->v_count == 1); } zfsctl_destroy(zfsvfs); ASSERT(zfsvfs->z_ctldir == NULL); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); ASSERT(zfsvfs->z_ctldir != NULL); } return (ret); } #ifdef illumos if (!(fflag & MS_FORCE)) { /* * Check the number of active vnodes in the file system. * Our count is maintained in the vfs structure, but the * number is off by 1 to indicate a hold on the vfs * structure itself. * * The '.zfs' directory maintains a reference of its * own, and any active references underneath are * reflected in the vnode count. */ if (zfsvfs->z_ctldir == NULL) { if (vfsp->vfs_count > 1) return (SET_ERROR(EBUSY)); } else { if (vfsp->vfs_count > 2 || zfsvfs->z_ctldir->v_count > 1) return (SET_ERROR(EBUSY)); } } #endif VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) err = vn_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { *vpp = zfsvfs->z_ctldir; ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else if (object == zfsvfs->z_shares_dir) { VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { vref(*vpp); } ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags | LK_RETRY); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY0(dmu_objset_hold(osname, zfsvfs, &os)); VERIFY3P(os->os_dsl_dataset->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(os->os_dsl_dataset)); dmu_objset_rele(os, zfsvfs); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; #ifdef illumos /* * If this is a snapshot, we have an extra VFS_HOLD on our parent * from zfs_mount(). Release it here. If we came through * zfs_mountroot() instead, we didn't grab an extra hold, so * skip the VFS_RELE for rootvfs. */ if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { const char *pname; int error = ENOENT; /* * Look up the file system's value for the property. For the * version property, we look up a slightly different string. */ if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; default: return (error); } error = 0; } return (error); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void)strlcpy(fromname, newname, sizeof(mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", newname, fromname + oldlen); (void)strlcpy(fromname, tmpbuf, sizeof(mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 306802) +++ head/sys/kern/vfs_cache.c (revision 306803) @@ -1,2248 +1,2248 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. */ struct namecache_ts { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ char nc_name[0]; /* segment name + nul */ }; /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. * * These locks are used (in the order in which they can be taken): * NAME TYPE ROLE * vnodelock mtx vnode lists and v_cache_dd field protection * bucketlock rwlock for access to given set of hash buckets * ncneg_mtx mtx negative entry LRU management * * Additionally, ncneg_shrink_lock mtx is used to have at most one thread * shrinking the LRU list. * * It is legal to take multiple vnodelock and bucketlock locks. The locking * order is lower address first. Both are recursive. * * "." lookups are lockless. * * ".." and vnode -> name lookups require vnodelock. * * name -> vnode lookup requires the relevant bucketlock to be held for reading. * * Insertions and removals of entries require involved vnodes and bucketlocks * to be write-locked to prevent other threads from seeing the entry. * * Some lookups result in removal of the found entry (e.g. getting rid of a * negative entry with the intent to create a positive one), which poses a * problem when multiple threads reach the state. Similarly, two different * threads can purge two different vnodes and try to remove the same name. * * If the already held vnode lock is lower than the second required lock, we * can just take the other lock. However, in the opposite case, this could * deadlock. As such, this is resolved by trylocking and if that fails unlocking * the first node, locking everything in order and revalidating the state. */ /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); static u_long numneg; /* number of negative entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "Number of negative entries in namecache"); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "Number of namecache entries"); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "Number of namecache entries with vnodes held"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_int ncpurgeminvnodes; SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, "Number of vnodes below which purgevfs ignores the request"); struct nchstats nchstats; /* cache effectiveness statistics */ static struct mtx ncneg_shrink_lock; MTX_SYSINIT(vfscache_shrink_neg, &ncneg_shrink_lock, "Name Cache shrink neg", MTX_DEF); static struct mtx_padalign ncneg_mtx; MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF); static u_int numbucketlocks; static struct rwlock_padalign *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)])) static u_int numvnodelocks; static struct mtx *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { struct mtx *vlp; if (vp == NULL) return (NULL); vlp = &vnodelocks[(((uintptr_t)(vp) >> 8) % numvnodelocks)]; return (vlp); } /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_small_ts; static uma_zone_t cache_zone_large; static uma_zone_t cache_zone_large_ts; #define CACHE_PATH_CUTOFF 35 static struct namecache * cache_alloc(int len, int ts) { if (len > CACHE_PATH_CUTOFF) { if (ts) return (uma_zalloc(cache_zone_large_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_large, M_WAITOK)); } if (ts) return (uma_zalloc(cache_zone_small_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_small, M_WAITOK)); } static void cache_free(struct namecache *ncp) { int ts; if (ncp == NULL) return; ts = ncp->nc_flag & NCF_TS; if ((ncp->nc_flag & NCF_DVDROP) != 0) vdrop(ncp->nc_dvp); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) { if (ts) uma_zfree(cache_zone_small_ts, ncp); else uma_zfree(cache_zone_small, ncp); } else if (ts) uma_zfree(cache_zone_large_ts, ncp); else uma_zfree(cache_zone_large, ncp); } static char * nc_get_name(struct namecache *ncp) { struct namecache_ts *ncp_ts; if ((ncp->nc_flag & NCF_TS) == 0) return (ncp->nc_name); ncp_ts = (struct namecache_ts *)ncp; return (ncp_ts->nc_name); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp != NULL) *tsp = ((struct namecache_ts *)ncp)->nc_time; if (ticksp != NULL) *ticksp = ((struct namecache_ts *)ncp)->nc_ticks; } static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE_ULONG(name, descr) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); #define STATNODE_COUNTER(name, descr) \ static counter_u64_t name; \ SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); STATNODE_ULONG(numneg, "Number of negative cache entries"); STATNODE_ULONG(numcache, "Number of cache entries"); STATNODE_COUNTER(numcalls, "Number of cache lookups"); STATNODE_COUNTER(dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothits, "Number of '..' hits"); STATNODE_COUNTER(numchecks, "Number of checks in lookup"); STATNODE_COUNTER(nummiss, "Number of cache misses"); STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); /* These count for kern___getcwd(), too. */ STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, "Number of times zap_and_exit failed to lock"); static long cache_lock_vnodes_cel_3_failures; STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp, bool neg_locked); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static int cache_yield; SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, "Number of times cache called yield"); static void cache_maybe_yield(void) { if (should_yield()) { cache_yield++; kern_yield(PRI_USER); } } static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); return (hash); } static inline struct rwlock * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp, int mode) { struct rwlock *blp; blp = NCP2BUCKETLOCK(ncp); rw_assert(blp, mode); } #else #define cache_assert_bucket_locked(x, y) do { } while (0) #endif #define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y)) static void _cache_sort(void **p1, void **p2) { void *tmp; if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) rw_wlock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) rw_wunlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort(&vlp1, &vlp2); MPASS(vlp2 != NULL); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) LIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management */ static void cache_negative_hit(struct namecache *ncp) { MPASS(ncp->nc_vp == NULL); mtx_lock(&ncneg_mtx); TAILQ_REMOVE(&ncneg, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); mtx_unlock(&ncneg_mtx); } static void cache_negative_insert(struct namecache *ncp) { MPASS(ncp->nc_vp == NULL); cache_assert_bucket_locked(ncp, RA_WLOCKED); mtx_lock(&ncneg_mtx); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); numneg++; mtx_unlock(&ncneg_mtx); } static void cache_negative_remove(struct namecache *ncp, bool neg_locked) { MPASS(ncp->nc_vp == NULL); cache_assert_bucket_locked(ncp, RA_WLOCKED); if (!neg_locked) mtx_lock(&ncneg_mtx); else mtx_assert(&ncneg_mtx, MA_OWNED); TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; if (!neg_locked) mtx_unlock(&ncneg_mtx); } static void cache_negative_zap_one(void) { struct namecache *ncp, *ncp2; struct mtx *dvlp; struct rwlock *blp; if (!mtx_trylock(&ncneg_shrink_lock)) return; mtx_lock(&ncneg_mtx); ncp = TAILQ_FIRST(&ncneg); if (ncp == NULL) { mtx_unlock(&ncneg_mtx); goto out; } MPASS(ncp->nc_vp == NULL); dvlp = VP2VNODELOCK(ncp->nc_dvp); blp = NCP2BUCKETLOCK(ncp); mtx_unlock(&ncneg_mtx); mtx_lock(dvlp); rw_wlock(blp); mtx_lock(&ncneg_mtx); ncp2 = TAILQ_FIRST(&ncneg); if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || blp != NCP2BUCKETLOCK(ncp2) || ncp2->nc_vp != NULL) { ncp = NULL; goto out_unlock_all; } cache_zap_locked(ncp, true); out_unlock_all: mtx_unlock(&ncneg_mtx); rw_wunlock(blp); mtx_unlock(dvlp); out: mtx_unlock(&ncneg_shrink_lock); cache_free(ncp); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp, bool neg_locked) { cache_assert_vnode_locked(ncp->nc_vp); cache_assert_vnode_locked(ncp->nc_dvp); cache_assert_bucket_locked(ncp, RA_WLOCKED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); if (ncp->nc_vp != NULL) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, nc_get_name(ncp), ncp->nc_vp); } else { SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, nc_get_name(ncp)); } LIST_REMOVE(ncp, nc_hash); if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) ncp->nc_dvp->v_cache_dd = NULL; } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; atomic_subtract_rel_long(&numcachehv, 1); } } if (ncp->nc_vp) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); if (ncp == ncp->nc_vp->v_cache_dd) ncp->nc_vp->v_cache_dd = NULL; } else { cache_negative_remove(ncp, neg_locked); } atomic_subtract_rel_long(&numcache, 1); } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct rwlock *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_vp == NULL); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct rwlock *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_vp == NULL) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } static int cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct rwlock *blp; int error = 0; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); pvlp = VP2VNODELOCK(vp); if (ncp->nc_vp == NULL) { cache_zap_negative_locked_vnode_kl(ncp, vp); goto out; } blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); cache_sort(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) { error = EAGAIN; goto out; } to_unlock = vlp1; } rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); mtx_unlock(to_unlock); out: mtx_unlock(pvlp); return (error); } static int cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp) { struct mtx *dvlp, *vlp; cache_assert_bucket_locked(ncp, RA_RLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { rw_runlock(blp); rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } rw_runlock(blp); return (EAGAIN); } static int cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, struct mtx **vlpp1, struct mtx **vlpp2) { struct mtx *dvlp, *vlp; cache_assert_bucket_locked(ncp, RA_WLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = VP2VNODELOCK(ncp->nc_vp); cache_sort(&dvlp, &vlp); if (*vlpp1 == dvlp && *vlpp2 == vlp) { cache_zap_locked(ncp, false); cache_unlock_vnodes(dvlp, vlp); *vlpp1 = NULL; *vlpp2 = NULL; return (0); } if (*vlpp1 != NULL) mtx_unlock(*vlpp1); if (*vlpp2 != NULL) mtx_unlock(*vlpp2); *vlpp1 = NULL; *vlpp2 = NULL; if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp, false); cache_unlock_vnodes(dvlp, vlp); return (0); } rw_wunlock(blp); *vlpp1 = dvlp; *vlpp2 = vlp; if (*vlpp1 != NULL) mtx_lock(*vlpp1); mtx_lock(*vlpp2); rw_wlock(blp); return (EAGAIN); } static void cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) { if (blp != NULL) { rw_runlock(blp); mtx_assert(vlp, MA_NOTOWNED); } else { mtx_unlock(vlp); } } /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative caching), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. If the directory vnode is * recycled out from under us due to a forced unmount, a status of * ENOENT is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct rwlock *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error, ltype; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } retry: blp = NULL; dvlp = VP2VNODELOCK(dvp); error = 0; counter_u64_add(numcalls, 1); if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; VREF(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if ((*vpp)->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { counter_u64_add(dotdothits, 1); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); mtx_unlock(dvlp); return (0); } if ((cnp->cn_flags & MAKEENTRY) == 0) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_dvp != dvp) panic("dvp %p v_cache_dd %p\n", dvp, ncp); if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { dvp->v_cache_dd = NULL; mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) *vpp = ncp->nc_vp; else *vpp = ncp->nc_dvp; /* Return failure if negative entry was found. */ if (*vpp == NULL) goto negative_success; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) *tsp = ((struct namecache_ts *)ncp)-> nc_dotdottime; goto success; } } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); rw_rlock(blp); LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { counter_u64_add(numchecks, 1); if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); if ((cnp->cn_flags & MAKEENTRY) == 0) { counter_u64_add(nummisszap, 1); } else { counter_u64_add(nummiss, 1); } goto unlock; } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { counter_u64_add(numposzaps, 1); goto zap_and_exit; } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp), *vpp); cache_out_ts(ncp, tsp, ticksp); goto success; } negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { counter_u64_add(numnegzaps, 1); goto zap_and_exit; } counter_u64_add(numneghits, 1); cache_negative_hit(ncp); if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, nc_get_name(ncp)); cache_out_ts(ncp, tsp, ticksp); cache_lookup_unlock(blp, dvlp); return (ENOENT); success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ MPASS(dvp != *vpp); ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } vhold(*vpp); cache_lookup_unlock(blp, dvlp); error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); if (dvp->v_iflag & VI_DOOMED) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } } if (error) { *vpp = NULL; goto retry; } if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); } return (-1); unlock: cache_lookup_unlock(blp, dvlp); return (0); zap_and_exit: if (blp != NULL) error = cache_zap_rlocked_bucket(ncp, blp); else error = cache_zap_locked_vnode(ncp, dvp); if (error != 0) { zap_and_exit_bucket_fail++; cache_maybe_yield(); goto retry; } cache_free(ncp); return (0); } struct celockstate { struct mtx *vlp[3]; struct rwlock *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); vlp = VP2VNODELOCK(vp); if (vlp == NULL) return (true); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_lock_vnodes_cel_3_failures++; cache_unlock_vnodes_cel(cel); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, struct rwlock *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort(&blp1, &blp2); if (blp1 != NULL) { rw_wlock(blp1); cel->blp[0] = blp1; } rw_wlock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) rw_wunlock(cel->blp[0]); rw_wunlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. If the * entry is negative, ncelock is locked instead of the vnode. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct rwlock *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = vp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct rwlock *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = dvp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *n3; struct nchashhead *ncpp; uint32_t hash; int flag; int len; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cache_enter: Adding a doomed vnode")); VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, ("cache_enter: Doomed vnode used as src")); if (!doingcache) return; /* * Avoid blowout in namecache entries. */ if (numcache >= desiredvnodes * ncsizefactor) return; cache_celockstate_init(&cel); ndd = NULL; flag = 0; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { len = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); /* * If dotdot entry already exists, just retarget it * to new parent vnode, otherwise continue with new * namecache entry allocation. */ if ((ncp = dvp->v_cache_dd) != NULL && ncp->nc_flag & NCF_ISDOTDOT) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); if (ncp->nc_vp != NULL) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); } else { cache_negative_remove(ncp, false); } if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); } else { cache_negative_insert(ncp); } ncp->nc_vp = vp; cache_enter_unlock(&cel); return; } dvp->v_cache_dd = NULL; cache_enter_unlock(&cel); cache_celockstate_init(&cel); SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); flag = NCF_ISDOTDOT; } } /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); ncp->nc_vp = vp; ncp->nc_dvp = dvp; ncp->nc_flag = flag; if (tsp != NULL) { n3 = (struct namecache_ts *)ncp; n3->nc_time = *tsp; n3->nc_ticks = ticks; n3->nc_flag |= NCF_TS; if (dtsp != NULL) { n3->nc_dotdottime = *dtsp; n3->nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1); cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); LIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) { if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n3 = (struct namecache_ts *)n2; n3->nc_time = ((struct namecache_ts *)ncp)->nc_time; n3->nc_ticks = ((struct namecache_ts *)ncp)->nc_ticks; if (dtsp != NULL) { n3->nc_dotdottime = ((struct namecache_ts *)ncp)-> nc_dotdottime; n3->nc_flag |= NCF_DTS; } } goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); dvp->v_cache_dd = ncp; } atomic_add_rel_long(&numcache, 1); if (vp != NULL) { if (vp->v_type == VDIR) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd, false); else ndd = NULL; } vp->v_cache_dd = ncp; } } else { vp->v_cache_dd = NULL; } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { vhold(dvp); atomic_add_rel_long(&numcachehv, 1); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ LIST_INSERT_HEAD(ncpp, ncp, nc_hash); /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp), vp); } else { if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; cache_negative_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, nc_get_name(ncp)); } cache_enter_unlock(&cel); if (numneg * ncnegfactor > numcache) cache_negative_zap_one(); cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); return; } static u_int cache_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; TAILQ_INIT(&ncneg); cache_zone_small = uma_zcreate("S VFS Cache", sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", sizeof(struct namecache) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", sizeof(struct namecache_ts) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); numbucketlocks = cache_roundup_2(mp_ncpus * 64); bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); numvnodelocks = cache_roundup_2(mp_ncpus * 64); vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); ncpurgeminvnodes = numbucketlocks; numcalls = counter_u64_alloc(M_WAITOK); dothits = counter_u64_alloc(M_WAITOK); dotdothits = counter_u64_alloc(M_WAITOK); numchecks = counter_u64_alloc(M_WAITOK); nummiss = counter_u64_alloc(M_WAITOK); nummisszap = counter_u64_alloc(M_WAITOK); numposzaps = counter_u64_alloc(M_WAITOK); numposhits = counter_u64_alloc(M_WAITOK); numnegzaps = counter_u64_alloc(M_WAITOK); numneghits = counter_u64_alloc(M_WAITOK); numfullpathcalls = counter_u64_alloc(M_WAITOK); numfullpathfail1 = counter_u64_alloc(M_WAITOK); numfullpathfail2 = counter_u64_alloc(M_WAITOK); numfullpathfail4 = counter_u64_alloc(M_WAITOK); numfullpathfound = counter_u64_alloc(M_WAITOK); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_changesize(int newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl; u_long new_nchash, old_nchash; struct namecache *ncp; uint32_t hash; int i; new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { free(new_nchashtbl, M_VFSCACHE); return; } /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp); LIST_REMOVE(ncp, nc_hash); LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } cache_unlock_all_buckets(); cache_unlock_all_vnodes(); free(old_nchashtbl, M_VFSCACHE); } /* * Invalidate all entries to a particular vnode. */ void cache_purge(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp, *vlp2; CTR1(KTR_VFS, "cache_purge(%p)", vp); SDT_PROBE1(vfs, namecache, purge, done, vp); if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL) return; TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Invalidate all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp; CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); SDT_PROBE1(vfs, namecache, purge_negative, done, vp); TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (ncp->nc_vp != NULL) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } mtx_unlock(vlp); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Flush all entries referencing a particular filesystem. */ void -cache_purgevfs(struct mount *mp) +cache_purgevfs(struct mount *mp, bool force) { TAILQ_HEAD(, namecache) ncps; struct mtx *vlp1, *vlp2; struct rwlock *blp; struct nchashhead *bucket; struct namecache *ncp, *nnp; u_long i, j, n_nchash; int error; /* Scan hash tables for applicable entries */ SDT_PROBE1(vfs, namecache, purgevfs, done, mp); - if (mp->mnt_nvnodelistsize <= ncpurgeminvnodes) + if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) return; TAILQ_INIT(&ncps); n_nchash = nchash + 1; vlp1 = vlp2 = NULL; for (i = 0; i < numbucketlocks; i++) { blp = (struct rwlock *)&bucketlocks[i]; rw_wlock(blp); for (j = i; j < n_nchash; j += numbucketlocks) { retry: bucket = &nchashtbl[j]; LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { cache_assert_bucket_locked(ncp, RA_WLOCKED); if (ncp->nc_dvp->v_mount != mp) continue; error = cache_zap_wlocked_bucket_kl(ncp, blp, &vlp1, &vlp2); if (error != 0) goto retry; TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); } } rw_wunlock(blp); if (vlp1 == NULL && vlp2 == NULL) cache_maybe_yield(); } if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, MAXPATHLEN)); } int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen, u_int path_max) { char *bp, *tmpbuf; struct filedesc *fdp; struct vnode *cdir, *rdir; int error; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > path_max) buflen = path_max; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); cdir = fdp->fd_cdir; VREF(cdir); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); vrele(rdir); vrele(cdir); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(bp); #endif } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; struct vnode *rdir; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); vrele(rdir); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, nc_get_name(ncp), vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (dvp->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * The magic behind kern___getcwd() and vn_fullpath(). */ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { int error, slash_prefixed; #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; buflen--; buf[buflen] = '\0'; error = 0; slash_prefixed = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); vref(vp); if (vp->v_type != VDIR) { error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) return (error); if (buflen == 0) { vrele(vp); return (ENOMEM); } buf[--buflen] = '/'; slash_prefixed = 1; } while (vp != rdir && vp != rootvnode) { if (vp->v_vflag & VV_ROOT) { if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vp1 = vp->v_mount->mnt_vnodecovered; vref(vp1); vrele(vp); vp = vp1; continue; } if (vp->v_type != VDIR) { vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = 1; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); *retbuf = buf + buflen; return (0); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vhold(ddvp); mtx_unlock(vlp); if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, nc_get_name(ncp), l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* ABI compat shims for old kernel modules. */ #undef cache_enter void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { cache_enter_time(dvp, vp, cnp, NULL, NULL); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If sysctl debug.disablefullpath is set, ENODEV is returned, * vnode is left locked and path remain untouched. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Return ENODEV if sysctl debug.disablefullpath==1 */ if (disablefullpath) return (ENODEV); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp, 0); error = vn_fullpath_global(td, vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE(&nd, NDF_ONLY_PNBUF); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 306802) +++ head/sys/kern/vfs_mount.c (revision 306803) @@ -1,2003 +1,2003 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ int sys_nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_activevnodelist); mp->mnt_activevnodelistsize = 0; TAILQ_INIT(&mp->mnt_tmpfreevnodelist); mp->mnt_tmpfreevnodelistsize = 0; mp->mnt_ref = 0; (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_activevnodelistsize != 0) panic("vfs_mount_destroy: nonzero activevnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) fsflags &= ~MNT_RDONLY; else if (strcmp(opt->name, "rw") == 0) fsflags &= ~MNT_RDONLY; else if (strcmp(opt->name, "ro") == 0) fsflags |= MNT_RDONLY; else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; vfs_freeopt(optlist, opt); } } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (vfsp->vfc_vfsops->vfs_cmount == NULL) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags); return (error); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp; int error; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp, 0); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); if (error != 0) { vfs_unbusy(mp); vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) panic("mount: lost mount"); VOP_UNLOCK(vp, 0); EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp, 0); mountcheckdirs(vp, newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct export_args export; void *bufp; struct mount *mp; int error, export_error, len; uint64_t flag; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&export, sizeof(export)); /* FALLTHROUGH */ case (sizeof(export)): bcopy(bufp, &export, len); export_error = vfs_export(mp, &export); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL)) return (EPERM); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); /* debug.disablefullpath == 1 results in ENODEV */ if (error == 0 || error == ENODEV) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(uap->flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0 || error == ENODEV) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, uap->flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp, *fsrootvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vfs_rel(mp); return (error); } vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ; if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ); if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. */ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); /* * From now, we can claim that the use reference on the * coveredvp is ours, and the ref can be released only by * successfull unmount by us, or left for later unmount * attempt. The previously acquired hold reference is no * longer needed to protect the vnode from reuse. */ if (coveredvp != NULL) vdrop(coveredvp); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); - cache_purgevfs(mp); /* remove cache entries for this file sys */ + cache_purgevfs(mp, false); /* remove cache entries for this file sys */ vfs_deallocate_syncvnode(mp); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL && (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 || (error = VFS_SYNC(mp, MNT_WAIT)) == 0) error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL && (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ; if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vput(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; case 'g': case 'G': iv *= 1024; case 'm': case 'M': iv *= 1024; case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) { bcopy(oexp, exp, sizeof(*oexp)); exp->ex_numsecflavors = 0; } Index: head/sys/kern/vfs_mountroot.c =================================================================== --- head/sys/kern/vfs_mountroot.c (revision 306802) +++ head/sys/kern/vfs_mountroot.c (revision 306803) @@ -1,1105 +1,1105 @@ /*- * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rootdevname.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. */ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); LIST_FOREACH(h, &root_holds, list) { if (h != LIST_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; if (root_mounted()) return (NULL); h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&root_holds_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL) return; mtx_lock(&root_holds_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); free(h, M_DEVBUF); } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("Cannot find root vnode"); VOP_UNLOCK(rootvnode, 0); p = curthread->td_proc; FILEDESC_XLOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_XUNLOCK(p->p_fd); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. */ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); - cache_purgevfs(mporoot); + cache_purgevfs(mporoot, true); if (mporoot != mpdevfs) - cache_purgevfs(mpdevfs); + cache_purgevfs(mpdevfs, true); VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vporoot); vporoot->v_mountedhere = NULL; mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); - cache_purgevfs(rootvnode->v_mount); + cache_purgevfs(rootvnode->v_mount, true); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); } if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); mporoot->mnt_vnodecovered = vp; vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VOP_UNLOCK(vp, 0); } else vput(vp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); vpdevfs->v_mountedhere = NULL; vrele(vpdevfs); } mpdevfs->mnt_vnodecovered = vp; vp->v_mountedhere = mpdevfs; VOP_UNLOCK(vp, 0); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); NDFREE(&nd, NDF_ONLY_PNBUF); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_unlinkat(td, AT_FDCWD, "/dev/dev", UIO_SYSSPACE, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:tank\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' && name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); PICKUP_GIANT(); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); error = kern_close(td, fd); out: free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. */ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static int parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread); error = namei(&nd); if (!error) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error != 0) ? 0 : 1; } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int error; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. */ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; sbuf_printf(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) sbuf_printf(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); sbuf_printf(sb, ".timeout 0\n"); sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) sbuf_printf(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct timeval lastfail; int curfail; curfail = 0; while (1) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&root_holds_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); } } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0') { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. */ DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if( strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); } Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 306802) +++ head/sys/sys/vnode.h (revision 306803) @@ -1,884 +1,884 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_actfreelist; /* l vnode active/free lists */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ u_int v_mflag; /* l mnt-specific vnode flags */ int v_writecount; /* v ref count of writers */ u_int v_hash; enum vtype v_type; /* u vnode type */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VI_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_ACTIVE 0x0200 /* This vnode is on the active list */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VMP_TMPMNTFREELIST 0x0001 /* Vnode is on mnt's tmp free list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(int newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_purge(struct vnode *vp); void cache_purge_negative(struct vnode *vp); -void cache_purgevfs(struct mount *mp); +void cache_purgevfs(struct mount *mp, bool force); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(u_int count); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen); int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); #define vdrop(vp) _vdrop((vp), 0) #define vdropl(vp) _vdrop((vp), 1) void _vdrop(struct vnode *, bool); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); #define vhold(vp) _vhold((vp), 0) #define vholdl(vp) _vhold((vp), 1) void _vhold(struct vnode *, bool); void vinactive(struct vnode *, struct thread *); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); void vop_create_post(void *a, int rc); void vop_deleteextattr_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_setextattr_post(void *a, int rc); void vop_symlink_post(void *a, int rc); #ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *a); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #else #define vop_strategy_pre(x) do { } while (0) #define vop_lock_pre(x) do { } while (0) #define vop_lock_post(x, y) do { } while (0) #define vop_unlock_post(x, y) do { } while (0) #define vop_unlock_pre(x) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefl(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(int newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct ucred *cred); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */