Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 310958) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 310959) @@ -1,2517 +1,2517 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (sys_shutdown && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } #ifndef __FreeBSD_kernel__ static int zfs_create_unique_device(dev_t *dev) { major_t new_major; do { ASSERT3U(zfs_minor, <=, MAXMIN32); minor_t start = zfs_minor; do { mutex_enter(&zfs_dev_mtx); if (zfs_minor >= MAXMIN32) { /* * If we're still using the real major * keep out of /dev/zfs and /dev/zvol minor * number space. If we're using a getudev()'ed * major number, we can use all of its minors. */ if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) zfs_minor = ZFS_MIN_MINOR; else zfs_minor = 0; } else { zfs_minor++; } *dev = makedevice(zfs_major, zfs_minor); mutex_exit(&zfs_dev_mtx); } while (vfs_devismounted(*dev) && zfs_minor != start); if (zfs_minor == start) { /* * We are using all ~262,000 minor numbers for the * current major number. Create a new major number. */ if ((new_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "zfs_mount: Can't get unique major " "device number."); return (-1); } mutex_enter(&zfs_dev_mtx); zfs_major = new_major; zfs_minor = 0; mutex_exit(&zfs_dev_mtx); } else { break; } /* CONSTANTCONDITION */ } while (1); return (0); } #endif /* !__FreeBSD_kernel__ */ static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; #ifdef illumos boolean_t devices = B_FALSE; boolean_t do_devices = B_FALSE; #endif boolean_t xattr = B_FALSE; boolean_t do_xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_config_sync() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); #ifdef illumos error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); #endif error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (SET_ERROR(ENOENT)); /* * If we have a NULL data pointer * then assume the id's aren't changing and * return EEXIST to the dmu to let it know to * use the same ids */ if (data == NULL) return (SET_ERROR(EEXIST)); if (bonustype == DMU_OT_ZNODE) { znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; sa_hdr_phys_t *sap = data; sa_hdr_phys_t sa = *sap; boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled * in yet. */ *userp = 0; *groupp = 0; return (0); } if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; } else { VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); if (swap) { *userp = BSWAP_64(*userp); *groupp = BSWAP_64(*groupp); } } return (0); } static void fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, char *domainbuf, int buflen, uid_t *ridp) { uint64_t fuid; const char *domain; fuid = strtonum(fuidstr, NULL); domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); if (domain) (void) strlcpy(domainbuf, domain, buflen); else domainbuf[0] = '\0'; *ridp = FUID_RID(fuid); } static uint64_t zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) { switch (type) { case ZFS_PROP_USERUSED: return (DMU_USERUSED_OBJECT); case ZFS_PROP_GROUPUSED: return (DMU_GROUPUSED_OBJECT); case ZFS_PROP_USERQUOTA: return (zfsvfs->z_userquota_obj); case ZFS_PROP_GROUPQUOTA: return (zfsvfs->z_groupquota_obj); } return (0); } int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) { int error; zap_cursor_t zc; zap_attribute_t za; zfs_useracct_t *buf = vbuf; uint64_t obj; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) { *bufsizep = 0; return (0); } for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > *bufsizep) break; fuidstr_to_sid(zfsvfs, za.za_name, buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); buf->zu_space = za.za_first_integer; buf++; } if (error == ENOENT) error = 0; ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; *cookiep = zap_cursor_serialize(&zc); zap_cursor_fini(&zc); return (error); } /* * buf must be big enough (eg, 32 bytes) */ static int id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, char *buf, boolean_t addok) { uint64_t fuid; int domainid = 0; if (domain && domain[0]) { domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); if (domainid == -1) return (SET_ERROR(ENOENT)); } fuid = FUID_ENCODE(domainid, rid); (void) sprintf(buf, "%llx", (longlong_t)fuid); return (0); } int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valp) { char buf[32]; int err; uint64_t obj; *valp = 0; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) return (0); err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); if (err) return (err); err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); if (err == ENOENT) err = 0; return (err); } int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota) { char buf[32]; int err; dmu_tx_t *tx; uint64_t *objp; boolean_t fuid_dirtied; if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) return (SET_ERROR(EINVAL)); if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) return (SET_ERROR(ENOTSUP)); objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : &zfsvfs->z_groupquota_obj; err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); if (err) return (err); fuid_dirtied = zfsvfs->z_fuid_dirty; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); if (*objp == 0) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, zfs_userquota_prop_prefixes[type]); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&zfsvfs->z_lock); if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); dmu_tx_commit(tx); return (err); } boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; int err; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); (void) sprintf(buf, "%llx", (longlong_t)fuid); err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); if (err != 0) return (B_FALSE); err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); if (err != 0) return (B_FALSE); return (used >= quota); } boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { uint64_t fuid; uint64_t quotaobj; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; fuid = isgroup ? zp->z_gid : zp->z_uid; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } int zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); /* * We claim to always be readonly so we can open snapshots; * other ZPL code will prevent us from writing to snapshots. */ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); if (error) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); #ifdef DIAGNOSTIC rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); #else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); #endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; else zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; /* * This is a barrier to prevent the filesystem from going away in * zfs_znode_move() until we can safely ensure that the filesystem is * not unmounted. We consider the filesystem valid before the barrier * and invalid after the barrier. */ rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; vnode_t *vp; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; #ifdef illumos /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; if (zfs_create_unique_device(&mount_dev) == -1) { error = SET_ERROR(ENODEV); goto out; } ASSERT(vfs_devismounted(mount_dev) == 0); #endif if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | vfsp->mnt_vfc->vfc_typenum & 0xFF; /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef SECLABEL /* * Convert a decimal digit string to a uint64_t integer. */ static int str_to_uint64(char *str, uint64_t *objnum) { uint64_t num = 0; while (*str) { if (*str < '0' || *str > '9') return (SET_ERROR(EINVAL)); num = num*10 + *str++ - '0'; } *objnum = num; return (0); } /* * The boot path passed from the boot loader is in the form of * "rootpool-name/root-filesystem-object-number'. Convert this * string to a dataset name: "rootpool-name/root-filesystem-name". */ static int zfs_parse_bootfs(char *bpath, char *outpath) { char *slashp; uint64_t objnum; int error; if (*bpath == 0 || *bpath == '/') return (SET_ERROR(EINVAL)); (void) strcpy(outpath, bpath); slashp = strchr(bpath, '/'); /* if no '/', just return the pool name */ if (slashp == NULL) { return (0); } /* if not a number, just return the root dataset name */ if (str_to_uint64(slashp+1, &objnum)) { return (0); } *slashp = '\0'; error = dsl_dsobj_to_dsname(bpath, objnum, outpath); *slashp = '/'; return (error); } /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : EACCES); } return (SET_ERROR(EACCES)); } /* * Determine whether the mount is allowed according to MAC check. * by comparing (where appropriate) label of the dataset against * the label of the zone being mounted into. If the dataset has * no label, create one. * * Returns 0 if access allowed, error otherwise (e.g. EACCES) */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) { int error, retv; zone_t *mntzone = NULL; ts_label_t *mnt_tsl; bslabel_t *mnt_sl; bslabel_t ds_sl; char ds_hexsl[MAXNAMELEN]; retv = EACCES; /* assume the worst */ /* * Start by getting the dataset label if it exists. */ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) return (SET_ERROR(EACCES)); /* * If labeling is NOT enabled, then disallow the mount of datasets * which have a non-default label already. No other label checks * are needed. */ if (!is_system_labeled()) { if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); return (SET_ERROR(EACCES)); } /* * Get the label of the mountpoint. If mounting into the global * zone (i.e. mountpoint is not within an active zone and the * zoned property is off), the label must be default or * admin_low/admin_high only; no other checks are needed. */ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); if (mntzone->zone_id == GLOBAL_ZONEID) { uint64_t zoned; zone_rele(mntzone); if (dsl_prop_get_integer(osname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EACCES)); if (!zoned) return (zfs_check_global_label(osname, ds_hexsl)); else /* * This is the case of a zone dataset being mounted * initially, before the zone has been fully created; * allow this mount into global zone. */ return (0); } mnt_tsl = mntzone->zone_slabel; ASSERT(mnt_tsl != NULL); label_hold(mnt_tsl); mnt_sl = label2bslabel(mnt_tsl); if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { /* * The dataset doesn't have a real label, so fabricate one. */ char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && dsl_prop_set_string(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), ZPROP_SRC_LOCAL, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { /* * Now compare labels to complete the MAC check. If the * labels are equal then allow access. If the mountpoint * label dominates the dataset label, allow readonly access. * Otherwise, access is denied. */ if (blequal(mnt_sl, &ds_sl)) retv = 0; else if (bldominates(mnt_sl, &ds_sl)) { vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); retv = 0; } } label_rele(mnt_tsl); zone_rele(mntzone); return (retv); } #endif /* SECLABEL */ #ifdef OPENSOLARIS_MOUNTROOT static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { int error = 0; static int zfsrootdone = 0; zfsvfs_t *zfsvfs = NULL; znode_t *zp = NULL; vnode_t *vp = NULL; char *zfs_bootfs; char *zfs_devid; ASSERT(vfsp); /* * The filesystem that we mount as root is defined in the * boot property "zfs-bootfs" with a format of * "poolname/root-dataset-objnum". */ if (why == ROOT_INIT) { if (zfsrootdone++) return (SET_ERROR(EBUSY)); /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do * something better by looking at the timestamp on * an uberblock, so just set it to -1. */ clkset(-1); if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { cmn_err(CE_NOTE, "spa_get_bootfs: can not get " "bootfs name"); return (SET_ERROR(EINVAL)); } zfs_devid = spa_get_bootprop("diskdevid"); error = spa_import_rootpool(rootfs.bo_name, zfs_devid); if (zfs_devid) spa_free_bootprop(zfs_devid); if (error) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "spa_import_rootpool: error %d", error); return (error); } if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", error); return (error); } spa_free_bootprop(zfs_bootfs); if (error = vfs_lock(vfsp)) return (error); if (error = zfs_domount(vfsp, rootfs.bo_name)) { cmn_err(CE_NOTE, "zfs_domount: error %d", error); goto out; } zfsvfs = (zfsvfs_t *)vfsp->vfs_data; ASSERT(zfsvfs); if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { cmn_err(CE_NOTE, "zfs_zget: error %d", error); goto out; } vp = ZTOV(zp); mutex_enter(&vp->v_lock); vp->v_flag |= VROOT; mutex_exit(&vp->v_lock); rootvp = vp; /* * Leave rootvp held. The root file system is never unmounted. */ vfs_add((struct vnode *)0, vfsp, (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); out: vfs_unlock(vfsp); return (error); } else if (why == ROOT_REMOUNT) { readonly_changed_cb(vfsp->vfs_data, B_FALSE); vfsp->vfs_flag |= VFS_REMOUNT; /* refresh mount options */ zfs_unregister_callbacks(vfsp->vfs_data); return (zfs_register_callbacks(vfsp)); } else if (why == ROOT_UNMOUNT) { zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); (void) zfs_sync(vfsp, 0, 0); return (0); } /* * if "why" is equal to anything else other than ROOT_INIT, * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. */ return (SET_ERROR(ENOTSUP)); } #endif /* OPENSOLARIS_MOUNTROOT */ static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; #ifdef illumos if (mvp->v_type != VDIR) return (SET_ERROR(ENOTDIR)); mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } mutex_exit(&mvp->v_lock); /* * ZFS does not support passing unparsed data in via MS_DATA. * Users should use the MS_OPTIONSTR interface; this means * that all option parsing is already done and the options struct * can be interrogated. */ if ((uap->flags & MS_DATA) && uap->datalen > 0) #else /* !illumos */ if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS)) return (SET_ERROR(EPERM)); if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); #endif /* illumos */ /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK(mvp, 0); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK(mvp, 0); goto out; } VOP_UNLOCK(mvp, 0); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curthread) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } #ifdef SECLABEL error = zfs_mount_label_policy(vfsp, osname); if (error) goto out; #endif vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); #ifdef illumos /* * Add an extra VFS_HOLD on our parent vfs so that it can't * disappear due to a forced unmount. */ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) VFS_HOLD(mvp->v_vfsp); #endif out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); #ifdef FREEBSD_NAMECACHE - cache_purgevfs(zfsvfs->z_parent->z_vfs); + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relavent for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the * parent's filesystem's vfsp. Note, 'z_parent' is self * referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) return (EBUSY); ASSERT(zfsvfs->z_ctldir->v_count == 1); } zfsctl_destroy(zfsvfs); ASSERT(zfsvfs->z_ctldir == NULL); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); ASSERT(zfsvfs->z_ctldir != NULL); } return (ret); } #ifdef illumos if (!(fflag & MS_FORCE)) { /* * Check the number of active vnodes in the file system. * Our count is maintained in the vfs structure, but the * number is off by 1 to indicate a hold on the vfs * structure itself. * * The '.zfs' directory maintains a reference of its * own, and any active references underneath are * reflected in the vnode count. */ if (zfsvfs->z_ctldir == NULL) { if (vfsp->vfs_count > 1) return (SET_ERROR(EBUSY)); } else { if (vfsp->vfs_count > 2 || zfsvfs->z_ctldir->v_count > 1) return (SET_ERROR(EBUSY)); } } #endif VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) err = vn_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { *vpp = zfsvfs->z_ctldir; ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else if (object == zfsvfs->z_shares_dir) { VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { vref(*vpp); } ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags | LK_RETRY); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); VERIFY0(dmu_objset_from_ds(ds, &os)); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; #ifdef illumos /* * If this is a snapshot, we have an extra VFS_HOLD on our parent * from zfs_mount(). Release it here. If we came through * zfs_mountroot() instead, we didn't grab an extra hold, so * skip the VFS_RELE for rootvfs. */ if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { const char *pname; int error = ENOENT; /* * Look up the file system's value for the property. For the * version property, we look up a slightly different string. */ if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; default: return (error); } error = 0; } return (error); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void)strlcpy(fromname, newname, sizeof(mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", newname, fromname + oldlen); (void)strlcpy(fromname, tmpbuf, sizeof(mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: stable/11/sys/kern/subr_witness.c =================================================================== --- stable/11/sys/kern/subr_witness.c (revision 310958) +++ stable/11/sys/kern/subr_witness.c (revision 310959) @@ -1,3018 +1,3026 @@ /*- * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ /* Define this to check for blessed mutexes */ #undef BLESSING #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (1024 + MAXCPU) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rw }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in_multi_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in6_multi_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udp", &lock_class_rw }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcp", &lock_class_rw }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_mtx_sleep }, { "bpf interface lock", &lock_class_rw }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* + * VFS namecache + */ + { "ncvn", &lock_class_mtx_sleep }, + { "ncbuc", &lock_class_rw }, + { "vnode interlock", &lock_class_mtx_sleep }, + { "ncneg", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, { "scrlock", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { "process slock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; #endif /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ static void witness_initialize(void *dummy __unused) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; int i; w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS, M_WAITOK | M_ZERO); w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) * (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. */ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL); void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. */ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no anscestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. */ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) goto out; #endif /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } else { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_witness->w_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } witness_debugger(1, __func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up in * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we want to keep always an * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have * further objects in the list, so the list ownership needs to be * hand over to another object if the current head needs to be freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); witness_voutput(fmt, ap); va_end(ap); witness_output( " with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, witness_output); } /* * Pin the thread in order to avoid problems with thread migration. * Once that all verifies are passed about spinlocks ownership, * the thread is in a safe path and it can be unpinned. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and as long as * the flags cannot match for this locks class, * check if the first spinlock is the one curthread * should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); witness_voutput(fmt, ap); va_end(ap); witness_output(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); n += witness_list_locks(&lock_list, witness_output); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; struct witness_list *typelist; MPASS(description != NULL); if (witness_watch == -1 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); else typelist = &w_spin; } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) { typelist = &w_sleep; } else { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { struct witness_list *list; MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { list = &w_sleep; w_sleep_cnt--; } else { list = &w_spin; w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. */ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or inderect descendant of @ancestor. */ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS(alllocks, db_witness_list_all) DB_SHOW_COMMAND(witness, db_witness_display) { witness_ddb_display(db_printf); } #endif static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; struct sbuf *sb; u_int w_rmatrix1, w_rmatrix2; int error, generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ req->oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ req->oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. */ *tmp_w2 = *w2; w_rmatrix1 = (unsigned int)w_rmatrix[i][j]; w_rmatrix2 = (unsigned int)w_rmatrix[j][i]; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ req->oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. */ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_zero(&data->wlod_stack); stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. */ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_zero(&st); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } Index: stable/11/sys/kern/vfs_cache.c =================================================================== --- stable/11/sys/kern/vfs_cache.c (revision 310958) +++ stable/11/sys/kern/vfs_cache.c (revision 310959) @@ -1,1510 +1,2454 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #ifdef KTRACE #include #endif #include SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); -SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", - "char *"); +SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *", + "char *", "int"); +SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *", + "char *", "int"); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ - struct vnode *nc_vp; /* vnode the name refers to */ + union { + struct vnode *nu_vp; /* vnode the name refers to */ + u_int nu_neghits; /* negative entry hits */ + } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. */ struct namecache_ts { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ - struct vnode *nc_vp; /* vnode the name refers to */ + union { + struct vnode *nu_vp; /* vnode the name refers to */ + u_int nu_neghits; /* negative entry hits */ + } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ char nc_name[0]; /* segment name + nul */ }; +#define nc_vp n_un.nu_vp +#define nc_neghits n_un.nu_neghits + /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 +#define NCF_DVDROP 0x10 +#define NCF_NEGATIVE 0x20 +#define NCF_HOTNEGATIVE 0x40 /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. + * + * These locks are used (in the order in which they can be taken): + * NAME TYPE ROLE + * vnodelock mtx vnode lists and v_cache_dd field protection + * bucketlock rwlock for access to given set of hash buckets + * neglist mtx negative entry LRU management + * + * Additionally, ncneg_shrink_lock mtx is used to have at most one thread + * shrinking the LRU list. + * + * It is legal to take multiple vnodelock and bucketlock locks. The locking + * order is lower address first. Both are recursive. + * + * "." lookups are lockless. + * + * ".." and vnode -> name lookups require vnodelock. + * + * name -> vnode lookup requires the relevant bucketlock to be held for reading. + * + * Insertions and removals of entries require involved vnodes and bucketlocks + * to be write-locked to prevent other threads from seeing the entry. + * + * Some lookups result in removal of the found entry (e.g. getting rid of a + * negative entry with the intent to create a positive one), which poses a + * problem when multiple threads reach the state. Similarly, two different + * threads can purge two different vnodes and try to remove the same name. + * + * If the already held vnode lock is lower than the second required lock, we + * can just take the other lock. However, in the opposite case, this could + * deadlock. As such, this is resolved by trylocking and if that fails unlocking + * the first node, locking everything in order and revalidating the state. */ /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ -static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); static u_long numneg; /* number of negative entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "Number of negative entries in namecache"); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "Number of namecache entries"); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "Number of namecache entries with vnodes held"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); +static u_int ncpurgeminvnodes; +SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, + "Number of vnodes below which purgevfs ignores the request"); +static u_int ncneghitsrequeue = 8; +SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0, + "Number of hits to requeue a negative entry in the LRU list"); struct nchstats nchstats; /* cache effectiveness statistics */ -static struct rwlock cache_lock; -RW_SYSINIT(vfscache, &cache_lock, "Name Cache"); +static struct mtx ncneg_shrink_lock; +MTX_SYSINIT(vfscache_shrink_neg, &ncneg_shrink_lock, "Name Cache shrink neg", + MTX_DEF); -#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock) -#define CACHE_RLOCK() rw_rlock(&cache_lock) -#define CACHE_RUNLOCK() rw_runlock(&cache_lock) -#define CACHE_WLOCK() rw_wlock(&cache_lock) -#define CACHE_WUNLOCK() rw_wunlock(&cache_lock) +struct neglist { + struct mtx nl_lock; + TAILQ_HEAD(, namecache) nl_list; +} __aligned(CACHE_LINE_SIZE); +static struct neglist *neglists; +static struct neglist ncneg_hot; + +static int shrink_list_turn; + +static u_int numneglists; +static inline struct neglist * +NCP2NEGLIST(struct namecache *ncp) +{ + + return (&neglists[(((uintptr_t)(ncp) >> 8) % numneglists)]); +} + +static u_int numbucketlocks; +static struct rwlock_padalign *bucketlocks; +#define HASH2BUCKETLOCK(hash) \ + ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)])) + +static u_int numvnodelocks; +static struct mtx *vnodelocks; +static inline struct mtx * +VP2VNODELOCK(struct vnode *vp) +{ + struct mtx *vlp; + + if (vp == NULL) + return (NULL); + vlp = &vnodelocks[(((uintptr_t)(vp) >> 8) % numvnodelocks)]; + return (vlp); +} + /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_small_ts; static uma_zone_t cache_zone_large; static uma_zone_t cache_zone_large_ts; #define CACHE_PATH_CUTOFF 35 static struct namecache * cache_alloc(int len, int ts) { if (len > CACHE_PATH_CUTOFF) { if (ts) return (uma_zalloc(cache_zone_large_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_large, M_WAITOK)); } if (ts) return (uma_zalloc(cache_zone_small_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_small, M_WAITOK)); } static void cache_free(struct namecache *ncp) { int ts; if (ncp == NULL) return; ts = ncp->nc_flag & NCF_TS; + if ((ncp->nc_flag & NCF_DVDROP) != 0) + vdrop(ncp->nc_dvp); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) { if (ts) uma_zfree(cache_zone_small_ts, ncp); else uma_zfree(cache_zone_small, ncp); } else if (ts) uma_zfree(cache_zone_large_ts, ncp); else uma_zfree(cache_zone_large, ncp); } static char * nc_get_name(struct namecache *ncp) { struct namecache_ts *ncp_ts; if ((ncp->nc_flag & NCF_TS) == 0) return (ncp->nc_name); ncp_ts = (struct namecache_ts *)ncp; return (ncp_ts->nc_name); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp != NULL) *tsp = ((struct namecache_ts *)ncp)->nc_time; if (ticksp != NULL) *ticksp = ((struct namecache_ts *)ncp)->nc_ticks; } static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE_ULONG(name, descr) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); #define STATNODE_COUNTER(name, descr) \ static counter_u64_t name; \ SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); STATNODE_ULONG(numneg, "Number of negative cache entries"); STATNODE_ULONG(numcache, "Number of cache entries"); STATNODE_COUNTER(numcalls, "Number of cache lookups"); STATNODE_COUNTER(dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothits, "Number of '..' hits"); STATNODE_COUNTER(numchecks, "Number of checks in lookup"); STATNODE_COUNTER(nummiss, "Number of cache misses"); STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); /* These count for kern___getcwd(), too. */ STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); -static long numupgrades; STATNODE_ULONG(numupgrades, - "Number of updates of the cache after lookup (write lock + retry)"); +static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, + "Number of times zap_and_exit failed to lock"); +static long cache_lock_vnodes_cel_3_failures; +STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, + "Number of times 3-way vnode locking failed"); -static void cache_zap(struct namecache *ncp); -static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, - u_int *buflen); +static void cache_zap_locked(struct namecache *ncp, bool neg_locked); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); +static int cache_yield; +SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, + "Number of times cache called yield"); + +static void +cache_maybe_yield(void) +{ + + if (should_yield()) { + cache_yield++; + kern_yield(PRI_USER); + } +} + +static inline void +cache_assert_vlp_locked(struct mtx *vlp) +{ + + if (vlp != NULL) + mtx_assert(vlp, MA_OWNED); +} + +static inline void +cache_assert_vnode_locked(struct vnode *vp) +{ + struct mtx *vlp; + + vlp = VP2VNODELOCK(vp); + cache_assert_vlp_locked(vlp); +} + static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); return (hash); } +static inline struct rwlock * +NCP2BUCKETLOCK(struct namecache *ncp) +{ + uint32_t hash; + + hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp); + return (HASH2BUCKETLOCK(hash)); +} + +#ifdef INVARIANTS +static void +cache_assert_bucket_locked(struct namecache *ncp, int mode) +{ + struct rwlock *blp; + + blp = NCP2BUCKETLOCK(ncp); + rw_assert(blp, mode); +} +#else +#define cache_assert_bucket_locked(x, y) do { } while (0) +#endif + +#define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y)) +static void +_cache_sort(void **p1, void **p2) +{ + void *tmp; + + if (*p1 > *p2) { + tmp = *p2; + *p2 = *p1; + *p1 = tmp; + } +} + +static void +cache_lock_all_buckets(void) +{ + u_int i; + + for (i = 0; i < numbucketlocks; i++) + rw_wlock(&bucketlocks[i]); +} + +static void +cache_unlock_all_buckets(void) +{ + u_int i; + + for (i = 0; i < numbucketlocks; i++) + rw_wunlock(&bucketlocks[i]); +} + +static void +cache_lock_all_vnodes(void) +{ + u_int i; + + for (i = 0; i < numvnodelocks; i++) + mtx_lock(&vnodelocks[i]); +} + +static void +cache_unlock_all_vnodes(void) +{ + u_int i; + + for (i = 0; i < numvnodelocks; i++) + mtx_unlock(&vnodelocks[i]); +} + static int +cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) +{ + + cache_sort(&vlp1, &vlp2); + MPASS(vlp2 != NULL); + + if (vlp1 != NULL) { + if (!mtx_trylock(vlp1)) + return (EAGAIN); + } + if (!mtx_trylock(vlp2)) { + if (vlp1 != NULL) + mtx_unlock(vlp1); + return (EAGAIN); + } + + return (0); +} + +static void +cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) +{ + + MPASS(vlp1 != NULL || vlp2 != NULL); + + if (vlp1 != NULL) + mtx_unlock(vlp1); + if (vlp2 != NULL) + mtx_unlock(vlp2); +} + +static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); - CACHE_RLOCK(); + cache_lock_all_buckets(); if (n_nchash != nchash + 1) { - CACHE_RUNLOCK(); + cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) LIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; - CACHE_RUNLOCK(); + cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); - CACHE_RLOCK(); + cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; - CACHE_RUNLOCK(); + cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* - * cache_zap(): + * Negative entries management * + * A variation of LRU scheme is used. New entries are hashed into one of + * numneglists cold lists. Entries get promoted to the hot list on first hit. + * Partial LRU for the hot list is maintained by requeueing them every + * ncneghitsrequeue hits. + * + * The shrinker will demote hot list head and evict from the cold list in a + * round-robin manner. + */ +static void +cache_negative_hit(struct namecache *ncp) +{ + struct neglist *neglist; + u_int hits; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + hits = atomic_fetchadd_int(&ncp->nc_neghits, 1); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + if ((hits % ncneghitsrequeue) != 0) + return; + mtx_lock(&ncneg_hot.nl_lock); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); + mtx_unlock(&ncneg_hot.nl_lock); + return; + } + /* + * The shrinker cleared the flag and removed the entry from + * the hot list. Put it back. + */ + } else { + mtx_lock(&ncneg_hot.nl_lock); + } + neglist = NCP2NEGLIST(ncp); + mtx_lock(&neglist->nl_lock); + if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); + ncp->nc_flag |= NCF_HOTNEGATIVE; + } + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); +} + +static void +cache_negative_insert(struct namecache *ncp, bool neg_locked) +{ + struct neglist *neglist; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + neglist = NCP2NEGLIST(ncp); + if (!neg_locked) { + mtx_lock(&neglist->nl_lock); + } else { + mtx_assert(&neglist->nl_lock, MA_OWNED); + } + TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); + if (!neg_locked) + mtx_unlock(&neglist->nl_lock); + atomic_add_rel_long(&numneg, 1); +} + +static void +cache_negative_remove(struct namecache *ncp, bool neg_locked) +{ + struct neglist *neglist; + bool hot_locked = false; + bool list_locked = false; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + neglist = NCP2NEGLIST(ncp); + if (!neg_locked) { + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + hot_locked = true; + mtx_lock(&ncneg_hot.nl_lock); + if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + list_locked = true; + mtx_lock(&neglist->nl_lock); + } + } else { + list_locked = true; + mtx_lock(&neglist->nl_lock); + } + } else { + mtx_assert(&neglist->nl_lock, MA_OWNED); + mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); + } + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + } else { + TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); + } + if (list_locked) + mtx_unlock(&neglist->nl_lock); + if (hot_locked) + mtx_unlock(&ncneg_hot.nl_lock); + atomic_subtract_rel_long(&numneg, 1); +} + +static void +cache_negative_shrink_select(int start, struct namecache **ncpp, + struct neglist **neglistpp) +{ + struct neglist *neglist; + struct namecache *ncp; + int i; + + *ncpp = ncp = NULL; + + for (i = start; i < numneglists; i++) { + neglist = &neglists[i]; + if (TAILQ_FIRST(&neglist->nl_list) == NULL) + continue; + mtx_lock(&neglist->nl_lock); + ncp = TAILQ_FIRST(&neglist->nl_list); + if (ncp != NULL) + break; + mtx_unlock(&neglist->nl_lock); + } + + *neglistpp = neglist; + *ncpp = ncp; +} + +static void +cache_negative_zap_one(void) +{ + struct namecache *ncp, *ncp2; + struct neglist *neglist; + struct mtx *dvlp; + struct rwlock *blp; + + if (!mtx_trylock(&ncneg_shrink_lock)) + return; + + mtx_lock(&ncneg_hot.nl_lock); + ncp = TAILQ_FIRST(&ncneg_hot.nl_list); + if (ncp != NULL) { + neglist = NCP2NEGLIST(ncp); + mtx_lock(&neglist->nl_lock); + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); + ncp->nc_flag &= ~NCF_HOTNEGATIVE; + mtx_unlock(&neglist->nl_lock); + } + + cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); + shrink_list_turn++; + if (shrink_list_turn == numneglists) + shrink_list_turn = 0; + if (ncp == NULL && shrink_list_turn == 0) + cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); + if (ncp == NULL) { + mtx_unlock(&ncneg_hot.nl_lock); + goto out; + } + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + dvlp = VP2VNODELOCK(ncp->nc_dvp); + blp = NCP2BUCKETLOCK(ncp); + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + mtx_lock(dvlp); + rw_wlock(blp); + mtx_lock(&ncneg_hot.nl_lock); + mtx_lock(&neglist->nl_lock); + ncp2 = TAILQ_FIRST(&neglist->nl_list); + if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || + blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { + ncp = NULL; + goto out_unlock_all; + } + SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp, + nc_get_name(ncp), ncp->nc_neghits); + + cache_zap_locked(ncp, true); +out_unlock_all: + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + rw_wunlock(blp); + mtx_unlock(dvlp); +out: + mtx_unlock(&ncneg_shrink_lock); + cache_free(ncp); +} + +/* + * cache_zap_locked(): + * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void -cache_zap(struct namecache *ncp) +cache_zap_locked(struct namecache *ncp, bool neg_locked) { - struct vnode *vp; - rw_assert(&cache_lock, RA_WLOCKED); - CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); - if (ncp->nc_vp != NULL) { + if (!(ncp->nc_flag & NCF_NEGATIVE)) + cache_assert_vnode_locked(ncp->nc_vp); + cache_assert_vnode_locked(ncp->nc_dvp); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, + (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); + if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, nc_get_name(ncp), ncp->nc_vp); } else { - SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, - nc_get_name(ncp)); + SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp, + nc_get_name(ncp), ncp->nc_neghits); } - vp = NULL; LIST_REMOVE(ncp, nc_hash); + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + if (ncp == ncp->nc_vp->v_cache_dd) + ncp->nc_vp->v_cache_dd = NULL; + } else { + cache_negative_remove(ncp, neg_locked); + } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) ncp->nc_dvp->v_cache_dd = NULL; } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { - vp = ncp->nc_dvp; - numcachehv--; + ncp->nc_flag |= NCF_DVDROP; + atomic_subtract_rel_long(&numcachehv, 1); } } - if (ncp->nc_vp) { - TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); - if (ncp == ncp->nc_vp->v_cache_dd) - ncp->nc_vp->v_cache_dd = NULL; + atomic_subtract_rel_long(&numcache, 1); +} + +static void +cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) +{ + struct rwlock *blp; + + MPASS(ncp->nc_dvp == vp); + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_vnode_locked(vp); + + blp = NCP2BUCKETLOCK(ncp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); +} + +static bool +cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, + struct mtx **vlpp) +{ + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; + struct rwlock *blp; + + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); + cache_assert_vnode_locked(vp); + + if (ncp->nc_flag & NCF_NEGATIVE) { + if (*vlpp != NULL) { + mtx_unlock(*vlpp); + *vlpp = NULL; + } + cache_zap_negative_locked_vnode_kl(ncp, vp); + return (true); + } + + pvlp = VP2VNODELOCK(vp); + blp = NCP2BUCKETLOCK(ncp); + vlp1 = VP2VNODELOCK(ncp->nc_dvp); + vlp2 = VP2VNODELOCK(ncp->nc_vp); + + if (*vlpp == vlp1 || *vlpp == vlp2) { + to_unlock = *vlpp; + *vlpp = NULL; } else { - TAILQ_REMOVE(&ncneg, ncp, nc_dst); - numneg--; + if (*vlpp != NULL) { + mtx_unlock(*vlpp); + *vlpp = NULL; + } + cache_sort(&vlp1, &vlp2); + if (vlp1 == pvlp) { + mtx_lock(vlp2); + to_unlock = vlp2; + } else { + if (!mtx_trylock(vlp1)) + goto out_relock; + to_unlock = vlp1; + } } - numcache--; - cache_free(ncp); - if (vp != NULL) - vdrop(vp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + if (to_unlock != NULL) + mtx_unlock(to_unlock); + return (true); + +out_relock: + mtx_unlock(vlp2); + mtx_lock(vlp1); + mtx_lock(vlp2); + MPASS(*vlpp == NULL); + *vlpp = vlp1; + return (false); } +static int +cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) +{ + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; + struct rwlock *blp; + int error = 0; + + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); + cache_assert_vnode_locked(vp); + + pvlp = VP2VNODELOCK(vp); + if (ncp->nc_flag & NCF_NEGATIVE) { + cache_zap_negative_locked_vnode_kl(ncp, vp); + goto out; + } + + blp = NCP2BUCKETLOCK(ncp); + vlp1 = VP2VNODELOCK(ncp->nc_dvp); + vlp2 = VP2VNODELOCK(ncp->nc_vp); + cache_sort(&vlp1, &vlp2); + if (vlp1 == pvlp) { + mtx_lock(vlp2); + to_unlock = vlp2; + } else { + if (!mtx_trylock(vlp1)) { + error = EAGAIN; + goto out; + } + to_unlock = vlp1; + } + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + mtx_unlock(to_unlock); +out: + mtx_unlock(pvlp); + return (error); +} + +static int +cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_RLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + rw_runlock(blp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + rw_runlock(blp); + return (EAGAIN); +} + +static int +cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, + struct mtx **vlpp1, struct mtx **vlpp2) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + cache_sort(&dvlp, &vlp); + + if (*vlpp1 == dvlp && *vlpp2 == vlp) { + cache_zap_locked(ncp, false); + cache_unlock_vnodes(dvlp, vlp); + *vlpp1 = NULL; + *vlpp2 = NULL; + return (0); + } + + if (*vlpp1 != NULL) + mtx_unlock(*vlpp1); + if (*vlpp2 != NULL) + mtx_unlock(*vlpp2); + *vlpp1 = NULL; + *vlpp2 = NULL; + + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + cache_zap_locked(ncp, false); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + rw_wunlock(blp); + *vlpp1 = dvlp; + *vlpp2 = vlp; + if (*vlpp1 != NULL) + mtx_lock(*vlpp1); + mtx_lock(*vlpp2); + rw_wlock(blp); + return (EAGAIN); +} + +static void +cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) +{ + + if (blp != NULL) { + rw_runlock(blp); + mtx_assert(vlp, MA_NOTOWNED); + } else { + mtx_unlock(vlp); + } +} + /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative caching), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. If the directory vnode is * recycled out from under us due to a forced unmount, a status of * ENOENT is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; + struct rwlock *blp; + struct mtx *dvlp, *dvlp2; uint32_t hash; - int error, ltype, wlocked; + int error, ltype; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } retry: - wlocked = 0; - counter_u64_add(numcalls, 1); + blp = NULL; + dvlp = VP2VNODELOCK(dvp); error = 0; + counter_u64_add(numcalls, 1); -retry_wlocked: if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; VREF(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if ((*vpp)->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } - if (!wlocked) - CACHE_RLOCK(); if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { counter_u64_add(dotdothits, 1); - if (dvp->v_cache_dd == NULL) { + dvlp2 = NULL; + mtx_lock(dvlp); +retry_dotdot: + ncp = dvp->v_cache_dd; + if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); - goto unlock; + mtx_unlock(dvlp); + return (0); } if ((cnp->cn_flags & MAKEENTRY) == 0) { - if (!wlocked && !CACHE_UPGRADE_LOCK()) - goto wlock; - if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) - cache_zap(dvp->v_cache_dd); - dvp->v_cache_dd = NULL; - CACHE_WUNLOCK(); + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { + if (ncp->nc_dvp != dvp) + panic("dvp %p v_cache_dd %p\n", dvp, ncp); + if (!cache_zap_locked_vnode_kl2(ncp, + dvp, &dvlp2)) + goto retry_dotdot; + MPASS(dvp->v_cache_dd == NULL); + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + cache_free(ncp); + } else { + dvp->v_cache_dd = NULL; + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + } return (0); } - ncp = dvp->v_cache_dd; - if (ncp->nc_flag & NCF_ISDOTDOT) - *vpp = ncp->nc_vp; - else + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { + if (ncp->nc_flag & NCF_NEGATIVE) + *vpp = NULL; + else + *vpp = ncp->nc_vp; + } else *vpp = ncp->nc_dvp; /* Return failure if negative entry was found. */ if (*vpp == NULL) goto negative_success; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) *tsp = ((struct namecache_ts *)ncp)-> nc_dotdottime; goto success; } - } else if (!wlocked) - CACHE_RLOCK(); + } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + blp = HASH2BUCKETLOCK(hash); + rw_rlock(blp); + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { counter_u64_add(numchecks, 1); if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); if ((cnp->cn_flags & MAKEENTRY) == 0) { counter_u64_add(nummisszap, 1); } else { counter_u64_add(nummiss, 1); } goto unlock; } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { counter_u64_add(numposzaps, 1); - if (!wlocked && !CACHE_UPGRADE_LOCK()) - goto wlock; - cache_zap(ncp); - CACHE_WUNLOCK(); - return (0); + goto zap_and_exit; } /* We found a "positive" match, return the vnode */ - if (ncp->nc_vp) { + if (!(ncp->nc_flag & NCF_NEGATIVE)) { counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp), *vpp); cache_out_ts(ncp, tsp, ticksp); goto success; } negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { counter_u64_add(numnegzaps, 1); - if (!wlocked && !CACHE_UPGRADE_LOCK()) - goto wlock; - cache_zap(ncp); - CACHE_WUNLOCK(); - return (0); + goto zap_and_exit; } - if (!wlocked && !CACHE_UPGRADE_LOCK()) - goto wlock; counter_u64_add(numneghits, 1); - /* - * We found a "negative" match, so we shift it to the end of - * the "negative" cache entries queue to satisfy LRU. Also, - * check to see if the entry is a whiteout; indicate this to - * the componentname, if so. - */ - TAILQ_REMOVE(&ncneg, ncp, nc_dst); - TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + cache_negative_hit(ncp); if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, nc_get_name(ncp)); cache_out_ts(ncp, tsp, ticksp); - CACHE_WUNLOCK(); + cache_lookup_unlock(blp, dvlp); return (ENOENT); -wlock: - /* - * We need to update the cache after our lookup, so upgrade to - * a write lock and retry the operation. - */ - CACHE_RUNLOCK(); - CACHE_WLOCK(); - numupgrades++; - wlocked = 1; - goto retry_wlocked; - success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ MPASS(dvp != *vpp); ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } vhold(*vpp); - if (wlocked) - CACHE_WUNLOCK(); - else - CACHE_RUNLOCK(); + cache_lookup_unlock(blp, dvlp); error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); if (dvp->v_iflag & VI_DOOMED) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } } if (error) { *vpp = NULL; goto retry; } if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); } return (-1); unlock: - if (wlocked) - CACHE_WUNLOCK(); + cache_lookup_unlock(blp, dvlp); + return (0); + +zap_and_exit: + if (blp != NULL) + error = cache_zap_rlocked_bucket(ncp, blp); else - CACHE_RUNLOCK(); + error = cache_zap_locked_vnode(ncp, dvp); + if (error != 0) { + zap_and_exit_bucket_fail++; + cache_maybe_yield(); + goto retry; + } + cache_free(ncp); return (0); } +struct celockstate { + struct mtx *vlp[3]; + struct rwlock *blp[2]; +}; +CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); +CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); + +static inline void +cache_celockstate_init(struct celockstate *cel) +{ + + bzero(cel, sizeof(*cel)); +} + +static void +cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, + struct vnode *dvp) +{ + struct mtx *vlp1, *vlp2; + + MPASS(cel->vlp[0] == NULL); + MPASS(cel->vlp[1] == NULL); + MPASS(cel->vlp[2] == NULL); + + MPASS(vp != NULL || dvp != NULL); + + vlp1 = VP2VNODELOCK(vp); + vlp2 = VP2VNODELOCK(dvp); + cache_sort(&vlp1, &vlp2); + + if (vlp1 != NULL) { + mtx_lock(vlp1); + cel->vlp[0] = vlp1; + } + mtx_lock(vlp2); + cel->vlp[1] = vlp2; +} + +static void +cache_unlock_vnodes_cel(struct celockstate *cel) +{ + + MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); + + if (cel->vlp[0] != NULL) + mtx_unlock(cel->vlp[0]); + if (cel->vlp[1] != NULL) + mtx_unlock(cel->vlp[1]); + if (cel->vlp[2] != NULL) + mtx_unlock(cel->vlp[2]); +} + +static bool +cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) +{ + struct mtx *vlp; + bool ret; + + cache_assert_vlp_locked(cel->vlp[0]); + cache_assert_vlp_locked(cel->vlp[1]); + MPASS(cel->vlp[2] == NULL); + + vlp = VP2VNODELOCK(vp); + MPASS(vlp != NULL); + + ret = true; + if (vlp >= cel->vlp[1]) { + mtx_lock(vlp); + } else { + if (mtx_trylock(vlp)) + goto out; + cache_lock_vnodes_cel_3_failures++; + cache_unlock_vnodes_cel(cel); + if (vlp < cel->vlp[0]) { + mtx_lock(vlp); + mtx_lock(cel->vlp[0]); + mtx_lock(cel->vlp[1]); + } else { + if (cel->vlp[0] != NULL) + mtx_lock(cel->vlp[0]); + mtx_lock(vlp); + mtx_lock(cel->vlp[1]); + } + ret = false; + } +out: + cel->vlp[2] = vlp; + return (ret); +} + +static void +cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, + struct rwlock *blp2) +{ + + MPASS(cel->blp[0] == NULL); + MPASS(cel->blp[1] == NULL); + + cache_sort(&blp1, &blp2); + + if (blp1 != NULL) { + rw_wlock(blp1); + cel->blp[0] = blp1; + } + rw_wlock(blp2); + cel->blp[1] = blp2; +} + +static void +cache_unlock_buckets_cel(struct celockstate *cel) +{ + + if (cel->blp[0] != NULL) + rw_wunlock(cel->blp[0]); + rw_wunlock(cel->blp[1]); +} + /* + * Lock part of the cache affected by the insertion. + * + * This means vnodelocks for dvp, vp and the relevant bucketlock. + * However, insertion can result in removal of an old entry. In this + * case we have an additional vnode and bucketlock pair to lock. If the + * entry is negative, ncelock is locked instead of the vnode. + * + * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while + * preserving the locking order (smaller address first). + */ +static void +cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, + uint32_t hash) +{ + struct namecache *ncp; + struct rwlock *blps[2]; + + blps[0] = HASH2BUCKETLOCK(hash); + for (;;) { + blps[1] = NULL; + cache_lock_vnodes_cel(cel, dvp, vp); + if (vp == NULL || vp->v_type != VDIR) + break; + ncp = vp->v_cache_dd; + if (ncp == NULL) + break; + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + MPASS(ncp->nc_dvp == vp); + blps[1] = NCP2BUCKETLOCK(ncp); + if (ncp->nc_flag & NCF_NEGATIVE) + break; + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) + break; + /* + * All vnodes got re-locked. Re-validate the state and if + * nothing changed we are done. Otherwise restart. + */ + if (ncp == vp->v_cache_dd && + (ncp->nc_flag & NCF_ISDOTDOT) != 0 && + blps[1] == NCP2BUCKETLOCK(ncp) && + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) + break; + cache_unlock_vnodes_cel(cel); + cel->vlp[0] = NULL; + cel->vlp[1] = NULL; + cel->vlp[2] = NULL; + } + cache_lock_buckets_cel(cel, blps[0], blps[1]); +} + +static void +cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, + uint32_t hash) +{ + struct namecache *ncp; + struct rwlock *blps[2]; + + blps[0] = HASH2BUCKETLOCK(hash); + for (;;) { + blps[1] = NULL; + cache_lock_vnodes_cel(cel, dvp, vp); + ncp = dvp->v_cache_dd; + if (ncp == NULL) + break; + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + MPASS(ncp->nc_dvp == dvp); + blps[1] = NCP2BUCKETLOCK(ncp); + if (ncp->nc_flag & NCF_NEGATIVE) + break; + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) + break; + if (ncp == dvp->v_cache_dd && + (ncp->nc_flag & NCF_ISDOTDOT) != 0 && + blps[1] == NCP2BUCKETLOCK(ncp) && + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) + break; + cache_unlock_vnodes_cel(cel); + cel->vlp[0] = NULL; + cel->vlp[1] = NULL; + cel->vlp[2] = NULL; + } + cache_lock_buckets_cel(cel, blps[0], blps[1]); +} + +static void +cache_enter_unlock(struct celockstate *cel) +{ + + cache_unlock_buckets_cel(cel); + cache_unlock_vnodes_cel(cel); +} + +/* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { - struct namecache *ncp, *n2; + struct celockstate cel; + struct namecache *ncp, *n2, *ndd; struct namecache_ts *n3; struct nchashhead *ncpp; + struct neglist *neglist; uint32_t hash; int flag; int len; + bool neg_locked; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cache_enter: Adding a doomed vnode")); VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, ("cache_enter: Doomed vnode used as src")); if (!doingcache) return; /* * Avoid blowout in namecache entries. */ if (numcache >= desiredvnodes * ncsizefactor) return; + cache_celockstate_init(&cel); + ndd = NULL; flag = 0; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { - CACHE_WLOCK(); + len = cnp->cn_namelen; + hash = cache_get_hash(cnp->cn_nameptr, len, dvp); + cache_enter_lock_dd(&cel, dvp, vp, hash); /* * If dotdot entry already exists, just retarget it * to new parent vnode, otherwise continue with new * namecache entry allocation. */ if ((ncp = dvp->v_cache_dd) != NULL && ncp->nc_flag & NCF_ISDOTDOT) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); - if (ncp->nc_vp != NULL) { + neg_locked = false; + if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) { + neglist = NCP2NEGLIST(ncp); + mtx_lock(&ncneg_hot.nl_lock); + mtx_lock(&neglist->nl_lock); + neg_locked = true; + } + if (!(ncp->nc_flag & NCF_NEGATIVE)) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); } else { - TAILQ_REMOVE(&ncneg, ncp, nc_dst); - numneg--; + cache_negative_remove(ncp, true); } if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); } else { - TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); - numneg++; + ncp->nc_flag &= ~(NCF_HOTNEGATIVE); + ncp->nc_flag |= NCF_NEGATIVE; + cache_negative_insert(ncp, true); } + if (neg_locked) { + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + } ncp->nc_vp = vp; - CACHE_WUNLOCK(); + cache_enter_unlock(&cel); return; } dvp->v_cache_dd = NULL; + cache_enter_unlock(&cel); + cache_celockstate_init(&cel); SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); - CACHE_WUNLOCK(); flag = NCF_ISDOTDOT; } } /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); + ncp->nc_flag = flag; ncp->nc_vp = vp; + if (vp == NULL) + ncp->nc_flag |= NCF_NEGATIVE; ncp->nc_dvp = dvp; - ncp->nc_flag = flag; if (tsp != NULL) { n3 = (struct namecache_ts *)ncp; n3->nc_time = *tsp; n3->nc_ticks = ticks; n3->nc_flag |= NCF_TS; if (dtsp != NULL) { n3->nc_dotdottime = *dtsp; n3->nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1); - CACHE_WLOCK(); + cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); LIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) { if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n3 = (struct namecache_ts *)n2; n3->nc_time = ((struct namecache_ts *)ncp)->nc_time; n3->nc_ticks = ((struct namecache_ts *)ncp)->nc_ticks; if (dtsp != NULL) { n3->nc_dotdottime = ((struct namecache_ts *)ncp)-> nc_dotdottime; + if (ncp->nc_flag & NCF_NEGATIVE) + mtx_lock(&ncneg_hot.nl_lock); n3->nc_flag |= NCF_DTS; + if (ncp->nc_flag & NCF_NEGATIVE) + mtx_unlock(&ncneg_hot.nl_lock); } } - CACHE_WUNLOCK(); - cache_free(ncp); - return; + goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ - if (dvp->v_cache_dd != NULL) { - CACHE_WUNLOCK(); - cache_free(ncp); - return; - } + if (dvp->v_cache_dd != NULL) + goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); dvp->v_cache_dd = ncp; } - numcache++; + atomic_add_rel_long(&numcache, 1); if (vp != NULL) { if (vp->v_type == VDIR) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ - if ((n2 = vp->v_cache_dd) != NULL && - (n2->nc_flag & NCF_ISDOTDOT) != 0) - cache_zap(n2); + if ((ndd = vp->v_cache_dd) != NULL) { + if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) + cache_zap_locked(ndd, false); + else + ndd = NULL; + } vp->v_cache_dd = ncp; } } else { vp->v_cache_dd = NULL; } } - /* - * Insert the new namecache entry into the appropriate chain - * within the cache entries table. - */ - LIST_INSERT_HEAD(ncpp, ncp, nc_hash); if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { vhold(dvp); - numcachehv++; + atomic_add_rel_long(&numcachehv, 1); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* + * Insert the new namecache entry into the appropriate chain + * within the cache entries table. + */ + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + + /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp), vp); } else { if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; - TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); - numneg++; + cache_negative_insert(ncp, false); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, nc_get_name(ncp)); } - if (numneg * ncnegfactor > numcache) { - ncp = TAILQ_FIRST(&ncneg); - KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg", - ncp, ncp->nc_vp)); - cache_zap(ncp); - } - CACHE_WUNLOCK(); + cache_enter_unlock(&cel); + if (numneg * ncnegfactor > numcache) + cache_negative_zap_one(); + cache_free(ndd); + return; +out_unlock_free: + cache_enter_unlock(&cel); + cache_free(ncp); + return; } +static u_int +cache_roundup_2(u_int val) +{ + u_int res; + + for (res = 1; res <= val; res <<= 1) + continue; + + return (res); +} + /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { + u_int i; - TAILQ_INIT(&ncneg); - cache_zone_small = uma_zcreate("S VFS Cache", sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", sizeof(struct namecache) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", sizeof(struct namecache_ts) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); + numbucketlocks = cache_roundup_2(mp_ncpus * 64); + if (numbucketlocks > nchash + 1) + numbucketlocks = nchash + 1; + bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numbucketlocks; i++) + rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); + numvnodelocks = cache_roundup_2(mp_ncpus * 64); + vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numvnodelocks; i++) + mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); + ncpurgeminvnodes = numbucketlocks; + numneglists = 4; + neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numneglists; i++) { + mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); + TAILQ_INIT(&neglists[i].nl_list); + } + mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); + TAILQ_INIT(&ncneg_hot.nl_list); + numcalls = counter_u64_alloc(M_WAITOK); dothits = counter_u64_alloc(M_WAITOK); dotdothits = counter_u64_alloc(M_WAITOK); numchecks = counter_u64_alloc(M_WAITOK); nummiss = counter_u64_alloc(M_WAITOK); nummisszap = counter_u64_alloc(M_WAITOK); numposzaps = counter_u64_alloc(M_WAITOK); numposhits = counter_u64_alloc(M_WAITOK); numnegzaps = counter_u64_alloc(M_WAITOK); numneghits = counter_u64_alloc(M_WAITOK); numfullpathcalls = counter_u64_alloc(M_WAITOK); numfullpathfail1 = counter_u64_alloc(M_WAITOK); numfullpathfail2 = counter_u64_alloc(M_WAITOK); numfullpathfail4 = counter_u64_alloc(M_WAITOK); numfullpathfound = counter_u64_alloc(M_WAITOK); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_changesize(int newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl; u_long new_nchash, old_nchash; struct namecache *ncp; uint32_t hash; int i; - new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash); + newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); + if (newmaxvnodes < numbucketlocks) + newmaxvnodes = numbucketlocks; + + new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { free(new_nchashtbl, M_VFSCACHE); return; } /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ - CACHE_WLOCK(); + cache_lock_all_vnodes(); + cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp); LIST_REMOVE(ncp, nc_hash); LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } - CACHE_WUNLOCK(); + cache_unlock_all_buckets(); + cache_unlock_all_vnodes(); free(old_nchashtbl, M_VFSCACHE); } /* * Invalidate all entries to a particular vnode. */ void cache_purge(struct vnode *vp) { + TAILQ_HEAD(, namecache) ncps; + struct namecache *ncp, *nnp; + struct mtx *vlp, *vlp2; CTR1(KTR_VFS, "cache_purge(%p)", vp); SDT_PROBE1(vfs, namecache, purge, done, vp); - CACHE_WLOCK(); - while (!LIST_EMPTY(&vp->v_cache_src)) - cache_zap(LIST_FIRST(&vp->v_cache_src)); - while (!TAILQ_EMPTY(&vp->v_cache_dst)) - cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); - if (vp->v_cache_dd != NULL) { - KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT, + if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && + vp->v_cache_dd == NULL) + return; + TAILQ_INIT(&ncps); + vlp = VP2VNODELOCK(vp); + vlp2 = NULL; + mtx_lock(vlp); +retry: + while (!LIST_EMPTY(&vp->v_cache_src)) { + ncp = LIST_FIRST(&vp->v_cache_src); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + while (!TAILQ_EMPTY(&vp->v_cache_dst)) { + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + ncp = vp->v_cache_dd; + if (ncp != NULL) { + KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); - cache_zap(vp->v_cache_dd); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); - CACHE_WUNLOCK(); + mtx_unlock(vlp); + if (vlp2 != NULL) + mtx_unlock(vlp2); + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } } /* * Invalidate all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { - struct namecache *cp, *ncp; + TAILQ_HEAD(, namecache) ncps; + struct namecache *ncp, *nnp; + struct mtx *vlp; CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); SDT_PROBE1(vfs, namecache, purge_negative, done, vp); - CACHE_WLOCK(); - LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) { - if (cp->nc_vp == NULL) - cache_zap(cp); + TAILQ_INIT(&ncps); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { + if (!(ncp->nc_flag & NCF_NEGATIVE)) + continue; + cache_zap_negative_locked_vnode_kl(ncp, vp); + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } - CACHE_WUNLOCK(); + mtx_unlock(vlp); + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } } /* * Flush all entries referencing a particular filesystem. */ void -cache_purgevfs(struct mount *mp) +cache_purgevfs(struct mount *mp, bool force) { - struct nchashhead *ncpp; + TAILQ_HEAD(, namecache) ncps; + struct mtx *vlp1, *vlp2; + struct rwlock *blp; + struct nchashhead *bucket; struct namecache *ncp, *nnp; + u_long i, j, n_nchash; + int error; /* Scan hash tables for applicable entries */ SDT_PROBE1(vfs, namecache, purgevfs, done, mp); - CACHE_WLOCK(); - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { - LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) { - if (ncp->nc_dvp->v_mount == mp) - cache_zap(ncp); + if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) + return; + TAILQ_INIT(&ncps); + n_nchash = nchash + 1; + vlp1 = vlp2 = NULL; + for (i = 0; i < numbucketlocks; i++) { + blp = (struct rwlock *)&bucketlocks[i]; + rw_wlock(blp); + for (j = i; j < n_nchash; j += numbucketlocks) { +retry: + bucket = &nchashtbl[j]; + LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { + cache_assert_bucket_locked(ncp, RA_WLOCKED); + if (ncp->nc_dvp->v_mount != mp) + continue; + error = cache_zap_wlocked_bucket_kl(ncp, blp, + &vlp1, &vlp2); + if (error != 0) + goto retry; + TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); + } } + rw_wunlock(blp); + if (vlp1 == NULL && vlp2 == NULL) + cache_maybe_yield(); } - CACHE_WUNLOCK(); + if (vlp1 != NULL) + mtx_unlock(vlp1); + if (vlp2 != NULL) + mtx_unlock(vlp2); + + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, MAXPATHLEN)); } int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen, u_int path_max) { char *bp, *tmpbuf; struct filedesc *fdp; struct vnode *cdir, *rdir; int error; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > path_max) buflen = path_max; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); cdir = fdp->fd_cdir; vrefact(cdir); rdir = fdp->fd_rdir; vrefact(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); vrele(rdir); vrele(cdir); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(bp); #endif } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; struct vnode *rdir; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); vrele(rdir); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { - int error; - - CACHE_RLOCK(); - error = vn_vptocnp_locked(vp, cred, buf, buflen); - if (error == 0) - CACHE_RUNLOCK(); - return (error); -} - -static int -vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, - u_int *buflen) -{ struct vnode *dvp; struct namecache *ncp; + struct mtx *vlp; int error; + vlp = VP2VNODELOCK(*vp); + mtx_lock(vlp); TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { - CACHE_RUNLOCK(); + mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, nc_get_name(ncp), vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); - CACHE_RUNLOCK(); + mtx_unlock(vlp); vrele(dvp); - CACHE_RLOCK(); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); - CACHE_RUNLOCK(); + mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; - CACHE_RLOCK(); if (dvp->v_iflag & VI_DOOMED) { /* forced unmount */ - CACHE_RUNLOCK(); vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * The magic behind kern___getcwd() and vn_fullpath(). */ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { int error, slash_prefixed; #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; buflen--; buf[buflen] = '\0'; error = 0; slash_prefixed = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); vref(vp); - CACHE_RLOCK(); if (vp->v_type != VDIR) { - error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); + error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) return (error); if (buflen == 0) { - CACHE_RUNLOCK(); vrele(vp); return (ENOMEM); } buf[--buflen] = '/'; slash_prefixed = 1; } while (vp != rdir && vp != rootvnode) { - if (vp->v_vflag & VV_ROOT) { - if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ - CACHE_RUNLOCK(); - vrele(vp); + /* + * The vp vnode must be already fully constructed, + * since it is either found in namecache or obtained + * from VOP_VPTOCNP(). We may test for VV_ROOT safely + * without obtaining the vnode lock. + */ + if ((vp->v_vflag & VV_ROOT) != 0) { + vn_lock(vp, LK_RETRY | LK_SHARED); + + /* + * With the vnode locked, check for races with + * unmount, forced or not. Note that we + * already verified that vp is not equal to + * the root vnode, which means that + * mnt_vnodecovered can be NULL only for the + * case of unmount. + */ + if ((vp->v_iflag & VI_DOOMED) != 0 || + (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || + vp1->v_mountedhere != vp->v_mount) { + vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } - vp1 = vp->v_mount->mnt_vnodecovered; + vref(vp1); - CACHE_RUNLOCK(); - vrele(vp); + vput(vp); vp = vp1; - CACHE_RLOCK(); continue; } if (vp->v_type != VDIR) { - CACHE_RUNLOCK(); vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } - error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); + error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) break; if (buflen == 0) { - CACHE_RUNLOCK(); vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = 1; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { - CACHE_RUNLOCK(); vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); - CACHE_RUNLOCK(); vrele(vp); SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); *retbuf = buf + buflen; return (0); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; + struct mtx *vlp; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); - CACHE_RLOCK(); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vhold(ddvp); - CACHE_RUNLOCK(); + mtx_unlock(vlp); if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) return (NULL); return (ddvp); } - CACHE_RUNLOCK(); + mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; + struct mtx *vlp; int l; - CACHE_RLOCK(); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { - CACHE_RUNLOCK(); + mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, nc_get_name(ncp), l); - CACHE_RUNLOCK(); + mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* ABI compat shims for old kernel modules. */ #undef cache_enter void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { cache_enter_time(dvp, vp, cnp, NULL, NULL); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If sysctl debug.disablefullpath is set, ENODEV is returned, * vnode is left locked and path remain untouched. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Return ENODEV if sysctl debug.disablefullpath==1 */ if (disablefullpath) return (ENODEV); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp, 0); error = vn_fullpath_global(td, vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE(&nd, NDF_ONLY_PNBUF); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } Index: stable/11/sys/kern/vfs_mount.c =================================================================== --- stable/11/sys/kern/vfs_mount.c (revision 310958) +++ stable/11/sys/kern/vfs_mount.c (revision 310959) @@ -1,2009 +1,2009 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ int sys_nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_activevnodelist); mp->mnt_activevnodelistsize = 0; mp->mnt_ref = 0; (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_activevnodelistsize != 0) panic("vfs_mount_destroy: nonzero activevnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) fsflags &= ~MNT_RDONLY; else if (strcmp(opt->name, "rw") == 0) fsflags &= ~MNT_RDONLY; else if (strcmp(opt->name, "ro") == 0) fsflags |= MNT_RDONLY; else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; vfs_freeopt(optlist, opt); } } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (vfsp->vfc_vfsops->vfs_cmount == NULL) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags); return (error); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp; int error; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp, 0); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); if (error != 0) { vfs_unbusy(mp); vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) panic("mount: lost mount"); VOP_UNLOCK(vp, 0); EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp, 0); mountcheckdirs(vp, newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct export_args export; void *bufp; struct mount *mp; int error, export_error, len; uint64_t flag; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto end; } mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&export, sizeof(export)); /* FALLTHROUGH */ case (sizeof(export)): bcopy(bufp, &export, len); export_error = vfs_export(mp, &export); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL)) return (EPERM); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); /* debug.disablefullpath == 1 results in ENODEV */ if (error == 0 || error == ENODEV) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(uap->flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0 || error == ENODEV) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, uap->flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } static void dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) { mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag &= ~mntkflags; if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp, *fsrootvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vfs_rel(mp); return (error); } vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ; if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT | MNTK_NOINSMNTQ); return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. */ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); /* * From now, we can claim that the use reference on the * coveredvp is ours, and the ref can be released only by * successfull unmount by us, or left for later unmount * attempt. The previously acquired hold reference is no * longer needed to protect the vnode from reuse. */ if (coveredvp != NULL) vdrop(coveredvp); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); - cache_purgevfs(mp); /* remove cache entries for this file sys */ + cache_purgevfs(mp, false); /* remove cache entries for this file sys */ vfs_deallocate_syncvnode(mp); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL && (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 || (error = VFS_SYNC(mp, MNT_WAIT)) == 0) error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL && (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ; if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vput(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; case 'g': case 'G': iv *= 1024; case 'm': case 'M': iv *= 1024; case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) { bcopy(oexp, exp, sizeof(*oexp)); exp->ex_numsecflavors = 0; } Index: stable/11/sys/kern/vfs_mountroot.c =================================================================== --- stable/11/sys/kern/vfs_mountroot.c (revision 310958) +++ stable/11/sys/kern/vfs_mountroot.c (revision 310959) @@ -1,1105 +1,1105 @@ /*- * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rootdevname.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. */ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); LIST_FOREACH(h, &root_holds, list) { if (h != LIST_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; if (root_mounted()) return (NULL); h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&root_holds_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL) return; mtx_lock(&root_holds_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); free(h, M_DEVBUF); } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("Cannot find root vnode"); VOP_UNLOCK(rootvnode, 0); p = curthread->td_proc; FILEDESC_XLOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_XUNLOCK(p->p_fd); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. */ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); - cache_purgevfs(mporoot); + cache_purgevfs(mporoot, true); if (mporoot != mpdevfs) - cache_purgevfs(mpdevfs); + cache_purgevfs(mpdevfs, true); VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vporoot); vporoot->v_mountedhere = NULL; mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); - cache_purgevfs(rootvnode->v_mount); + cache_purgevfs(rootvnode->v_mount, true); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); } if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); mporoot->mnt_vnodecovered = vp; vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VOP_UNLOCK(vp, 0); } else vput(vp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); vpdevfs->v_mountedhere = NULL; vrele(vpdevfs); } mpdevfs->mnt_vnodecovered = vp; vp->v_mountedhere = mpdevfs; VOP_UNLOCK(vp, 0); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); NDFREE(&nd, NDF_ONLY_PNBUF); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_unlinkat(td, AT_FDCWD, "/dev/dev", UIO_SYSSPACE, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:tank\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' && name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); PICKUP_GIANT(); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); error = kern_close(td, fd); out: free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. */ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static int parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread); error = namei(&nd); if (!error) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error != 0) ? 0 : 1; } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int error; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. */ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; sbuf_printf(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) sbuf_printf(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); sbuf_printf(sb, ".timeout 0\n"); sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) sbuf_printf(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct timeval lastfail; int curfail; curfail = 0; while (1) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&root_holds_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); } } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0') { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. */ DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if( strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); } Index: stable/11/sys/sys/vnode.h =================================================================== --- stable/11/sys/sys/vnode.h (revision 310958) +++ stable/11/sys/sys/vnode.h (revision 310959) @@ -1,899 +1,899 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * f - freelist mutex * i - interlock * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_actfreelist; /* f vnode active/free lists */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ int v_writecount; /* v ref count of writers */ u_int v_hash; enum vtype v_type; /* u vnode type */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VI_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_ACTIVE 0x0200 /* This vnode is on the active list */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str); #endif void assert_vop_locked(struct vnode *vp, const char *str); #if 0 voi0 assert_vop_slocked(struct vnode *vp, const char *str); #endif void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str)) #endif #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) assert_vop_slocked((vp), (str)) #endif #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) #endif #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) #endif #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(int newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_purge(struct vnode *vp); void cache_purge_negative(struct vnode *vp); -void cache_purgevfs(struct mount *mp); +void cache_purgevfs(struct mount *mp, bool force); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(u_int count); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen); #define textvp_fullpath(p, rb, rfb) \ vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb) int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); #define vdrop(vp) _vdrop((vp), 0) #define vdropl(vp) _vdrop((vp), 1) void _vdrop(struct vnode *, bool); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); #define vhold(vp) _vhold((vp), 0) #define vholdl(vp) _vhold((vp), 1) void _vhold(struct vnode *, bool); void vinactive(struct vnode *, struct thread *); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); void vop_create_post(void *a, int rc); void vop_deleteextattr_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_setextattr_post(void *a, int rc); void vop_symlink_post(void *a, int rc); #ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *a); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #else #define vop_strategy_pre(x) do { } while (0) #define vop_lock_pre(x) do { } while (0) #define vop_lock_post(x, y) do { } while (0) #define vop_unlock_post(x, y) do { } while (0) #define vop_unlock_pre(x) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefl(struct vnode *vp); void vrefact(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(int newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct ucred *cred); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: stable/11 =================================================================== --- stable/11 (revision 310958) +++ stable/11 (revision 310959) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r305378-305379,305386,305684,306224,306608,306803,307650,307685,308407,308665,308667,309067