Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 332526) @@ -1,2003 +1,2010 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 RackTop Systems. */ #include #include #include #include #include #include #include #include #include #include #include #include #include static kmem_cache_t *dnode_cache; /* * Define DNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef DEBUG #define DNODE_STATS #endif /* DEBUG */ #ifdef DNODE_STATS #define DNODE_STAT_ADD(stat) ((stat)++) #else #define DNODE_STAT_ADD(stat) /* nothing */ #endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, &zfs_default_bs, 0, "Default dnode block shift"); SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, &zfs_default_ibs, 0, "Default dnode indirect block shift"); #ifdef illumos +#ifdef _KERNEL static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); +#endif /* _KERNEL */ #endif static int dbuf_compare(const void *x1, const void *x2) { const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; if (d1->db_level < d2->db_level) { return (-1); } if (d1->db_level > d2->db_level) { return (1); } if (d1->db_blkid < d2->db_blkid) { return (-1); } if (d1->db_blkid > d2->db_blkid) { return (1); } if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); } else if (d2->db_state == DB_SEARCH) { ASSERT3S(d1->db_state, !=, DB_SEARCH); return (1); } if ((uintptr_t)d1 < (uintptr_t)d2) { return (-1); } if ((uintptr_t)d1 > (uintptr_t)d2) { return (1); } return (0); } /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { dnode_t *dn = arg; int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is * O(number of references), so don't track dn_holds. 
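A standalone sketch of the three-level ordering that dbuf_compare() above establishes for the per-dnode AVL tree (db_level first, then db_blkid, then pointer identity as the final tiebreaker). It uses a simplified struct instead of dmu_buf_impl_t and omits the DB_SEARCH sentinel handling; illustrative only, not part of this revision.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for dmu_buf_impl_t: only the sort keys. */
struct fake_dbuf {
	uint8_t  level;   /* indirection level (db_level) */
	uint64_t blkid;   /* block id within that level (db_blkid) */
};

/* Order by level, then blkid, then address, mirroring dbuf_compare(). */
static int
fake_dbuf_compare(const void *x1, const void *x2)
{
	const struct fake_dbuf *d1 = x1;
	const struct fake_dbuf *d2 = x2;

	if (d1->level != d2->level)
		return (d1->level < d2->level ? -1 : 1);
	if (d1->blkid != d2->blkid)
		return (d1->blkid < d2->blkid ? -1 : 1);
	if ((uintptr_t)d1 != (uintptr_t)d2)
		return ((uintptr_t)d1 < (uintptr_t)d2 ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct fake_dbuf a = { 0, 7 }, b = { 1, 0 };

	/* A level-0 dbuf sorts before any level-1 dbuf, regardless of blkid. */
	printf("compare(a, b) = %d\n", fake_dbuf_compare(&a, &b));
	return (0);
}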
*/ refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; POINTER_INVALIDATE(&dn->dn_objset); return (0); } /* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { int i; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); } ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); ASSERT(!dn->dn_have_spill); ASSERT3P(dn->dn_zio, ==, NULL); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); ASSERT0(dn->dn_oldgid); ASSERT0(dn->dn_newuid); ASSERT0(dn->dn_newgid); ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); avl_destroy(&dn->dn_dbufs); } void dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); +#ifdef _KERNEL kmem_cache_set_move(dnode_cache, dnode_move); +#endif /* _KERNEL */ } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } #ifdef ZFS_DEBUG void dnode_verify(dnode_t *dn) { int drop_struct_lock = FALSE; ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; ASSERT3U(dn->dn_indblkshift, >=, 0); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ASSERT3U(1<dn_datablkshift, 
==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } #endif void dnode_byteswap(dnode_phys_t *dnp) { uint64_t *buf64 = (void*)&dnp->dn_blkptr; int i; if (dnp->dn_type == DMU_OT_NONE) { bzero(dnp, sizeof (dnode_phys_t)); return; } dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); /* * dn_nblkptr is only one byte, so it's OK to read it in either * byte order. We can't read dn_bouslen. */ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) buf64[i] = BSWAP_64(buf64[i]); /* * OK to check dn_bonuslen for zero, because it won't matter if * we have the wrong byte order. This is necessary because the * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { /* * Note that the bonus length calculated here may be * longer than the actual bonus buffer. This is because * we always put the bonus buffer after the last block * pointer (instead of packing it against the end of the * dnode buffer). 
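A small arithmetic sketch of how the bonus area is located after the last block pointer, as the comment above describes and as dnode_byteswap()/dnode_setbonuslen() compute it. The constants are local stand-ins assumed to match the classic 512-byte dnode layout (sizeof (blkptr_t) == 128, DN_MAX_BONUSLEN == 320); illustrative only, not part of this change.

#include <stdio.h>

/* Stand-in constants (assumed, classic 512-byte dnode layout). */
#define FAKE_BLKPTR_SIZE	128	/* sizeof (blkptr_t) */
#define FAKE_DN_MAX_BONUSLEN	320	/* DN_MAX_BONUSLEN */

int
main(void)
{
	/* The bonus buffer starts right after the last block pointer. */
	for (int nblkptr = 1; nblkptr <= 3; nblkptr++) {
		int off = (nblkptr - 1) * FAKE_BLKPTR_SIZE;
		int len = FAKE_DN_MAX_BONUSLEN - off;
		printf("nblkptr=%d  bonus offset=%d  max bonus len=%d\n",
		    nblkptr, off, len);
	}
	return (0);
}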
*/ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { dnode_phys_t *buf = vbuf; int i; ASSERT3U(sizeof (dnode_phys_t), ==, (1<>= DNODE_SHIFT; for (i = 0; i < size; i++) { dnode_byteswap(buf); buf++; } } void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; else dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; rw_exit(&dn->dn_struct_rwlock); } void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dn->dn_bonustype = newtype; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; rw_exit(&dn->dn_struct_rwlock); } void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); dnode_setdirty(dn, tx); dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; dn->dn_have_spill = B_FALSE; } static void dnode_setdblksz(dnode_t *dn, int size) { ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); +#ifdef _KERNEL ASSERT(!POINTER_IS_VALID(dn->dn_objset)); +#endif /* _KERNEL */ dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate * for the dnode_move() callback. */ dn->dn_object = object; dn->dn_dbuf = db; dn->dn_handle = dnh; dn->dn_phys = dnp; if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } else { dn->dn_datablksz = 0; dn->dn_datablkszsec = 0; dn->dn_datablkshift = 0; } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; dn->dn_nblkptr = dnp->dn_nblkptr; dn->dn_checksum = dnp->dn_checksum; dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); if (dnh->dnh_dnode != NULL) { /* Lost the allocation race. */ mutex_exit(&os->os_lock); kmem_cache_free(dnode_cache, dn); return (dnh->dnh_dnode); } /* * Exclude special dnodes from os_dnodes so an empty os_dnodes * signifies that the special dnodes have no references from * their children (the entries in os_dnodes). 
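dnode_setdblksz() above derives dn_datablkshift only for power-of-two block sizes, via highbit64(size - 1). A hedged, portable sketch of that derivation follows, with a simple loop standing in for the illumos highbit64(); illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Highest set bit, 1-indexed; 0 for input 0 (loop-based stand-in for highbit64()). */
static int
highbit64_sketch(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t sizes[] = { 512, 4096, 131072, 6144 /* not a power of two */ };

	for (int i = 0; i < 4; i++) {
		uint64_t size = sizes[i];
		int ispow2 = (size & (size - 1)) == 0;
		/* Mirrors: dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0 */
		int shift = ispow2 ? highbit64_sketch(size - 1) : 0;
		printf("size=%llu shift=%d\n", (unsigned long long)size, shift);
	}
	return (0);
}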
This allows * dnode_destroy() to easily determine if the last child has * been removed and then complete eviction of the objset. */ if (!DMU_OBJECT_IS_SPECIAL(object)) list_insert_head(&os->os_dnodes, dn); membar_producer(); /* * Everything else must be valid before assigning dn_objset * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } /* * Caller must be holding the dnode handle, which is released upon return. */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { list_remove(&os->os_dnodes, dn); complete_os_eviction = list_is_empty(&os->os_dnodes) && list_link_active(&os->os_evicting_node); } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; dn->dn_have_spill = B_FALSE; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); if (complete_os_eviction) dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int i; ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); if (ibs == 0) ibs = zfs_default_ibs; ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; if (bonustype == DMU_OT_SA) /* Maximize 
bonus space for SA */ dn->dn_nblkptr = 1; else dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; if (dn->dn_dirtyctx_firstset) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { /* change blocksize */ ASSERT(dn->dn_maxblkid == 0 && (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || dnode_block_freed(dn, 0))); dnode_setdblksz(dn, blocksize); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); } #ifdef DNODE_STATS static struct { uint64_t dms_dnode_invalid; uint64_t dms_dnode_recheck1; uint64_t dms_dnode_recheck2; uint64_t dms_dnode_special; uint64_t dms_dnode_handle; uint64_t dms_dnode_rwlock; uint64_t dms_dnode_active; } dnode_move_stats; #endif /* DNODE_STATS */ +#ifdef _KERNEL static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { int i; ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); /* Copy fields. 
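Both dnode_allocate() and dnode_reallocate() above size dn_nblkptr from the requested bonus length: one block pointer is always kept, and each additional 128-byte slot left free by the bonus buffer yields another. A hedged arithmetic sketch (SPA_BLKPTRSHIFT assumed to be 7, DN_MAX_BONUSLEN assumed to be 320) follows; illustrative only.

#include <stdio.h>

#define FAKE_SPA_BLKPTRSHIFT	7	/* log2(sizeof (blkptr_t)), assumed */
#define FAKE_DN_MAX_BONUSLEN	320	/* assumed classic value */

int
main(void)
{
	int bonuslens[] = { 0, 64, 192, 320 };

	for (int i = 0; i < 4; i++) {
		int bonuslen = bonuslens[i];
		/* Mirrors: 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT) */
		int nblkptr = 1 +
		    ((FAKE_DN_MAX_BONUSLEN - bonuslen) >> FAKE_SPA_BLKPTRSHIFT);
		printf("bonuslen=%3d -> nblkptr=%d\n", bonuslen, nblkptr);
	}
	return (0);
}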
*/ ndn->dn_objset = odn->dn_objset; ndn->dn_object = odn->dn_object; ndn->dn_dbuf = odn->dn_dbuf; ndn->dn_handle = odn->dn_handle; ndn->dn_phys = odn->dn_phys; ndn->dn_type = odn->dn_type; ndn->dn_bonuslen = odn->dn_bonuslen; ndn->dn_bonustype = odn->dn_bonustype; ndn->dn_nblkptr = odn->dn_nblkptr; ndn->dn_checksum = odn->dn_checksum; ndn->dn_compress = odn->dn_compress; ndn->dn_nlevels = odn->dn_nlevels; ndn->dn_indblkshift = odn->dn_indblkshift; ndn->dn_datablkshift = odn->dn_datablkshift; ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], sizeof (odn->dn_next_nlevels)); bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], sizeof (odn->dn_next_indblkshift)); bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], sizeof (odn->dn_next_bonustype)); bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], sizeof (odn->dn_rm_spillblk)); bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], sizeof (odn->dn_free_ranges)); ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_bonus = odn->dn_bonus; ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; ndn->dn_oldused = odn->dn_oldused; ndn->dn_oldflags = odn->dn_oldflags; ndn->dn_olduid = odn->dn_olduid; ndn->dn_oldgid = odn->dn_oldgid; ndn->dn_newuid = odn->dn_newuid; ndn->dn_newgid = odn->dn_newgid; ndn->dn_id_flags = odn->dn_id_flags; dmu_zfetch_init(&ndn->dn_zfetch, NULL); list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; /* * Update back pointers. Updating the handle fixes the back pointer of * every descendant dbuf as well as the bonus dbuf. */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; if (ndn->dn_zfetch.zf_dnode == odn) { ndn->dn_zfetch.zf_dnode = ndn; } /* * Invalidate the original dnode by clearing all of its back pointers. */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_bonus = NULL; odn->dn_zfetch.zf_dnode = NULL; /* * Set the low bit of the objset pointer to ensure that dnode_move() * recognizes the dnode as invalid in any subsequent callback. */ POINTER_INVALIDATE(&odn->dn_objset); /* * Satisfy the destructor. 
*/ for (i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); odn->dn_free_ranges[i] = NULL; odn->dn_next_nlevels[i] = 0; odn->dn_next_indblkshift[i] = 0; odn->dn_next_bonustype[i] = 0; odn->dn_rm_spillblk[i] = 0; odn->dn_next_bonuslen[i] = 0; odn->dn_next_blksz[i] = 0; } odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; odn->dn_oldflags = 0; odn->dn_olduid = 0; odn->dn_oldgid = 0; odn->dn_newuid = 0; odn->dn_newgid = 0; odn->dn_id_flags = 0; /* * Mark the dnode. */ ndn->dn_moved = 1; odn->dn_moved = (uint8_t)-1; } #ifdef illumos -#ifdef _KERNEL /*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { dnode_t *odn = buf, *ndn = newbuf; objset_t *os; int64_t refcount; uint32_t dbufs; /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when * freeing the dnode to invalidate it, and the memory patterns written * by kmem (baddcafe and deadbeef) set at least one of the two low bits. * A newly created dnode sets the objset pointer last of all to indicate * that the dnode is known and in a valid state to be moved by this * function. */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Ensure that the objset does not go away during the move. */ rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the dnode is still valid, then so is the objset. We know that no * valid objset can be freed while we hold os_lock, so we can safely * ensure that the objset remains in use. */ mutex_enter(&os->os_lock); /* * Recheck the objset pointer in case the dnode was removed just before * acquiring the lock. */ if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold os->os_lock, the dnode * cannot be freed and fields within the dnode can be safely accessed. * The objset listing this dnode cannot go away as long as this dnode is * on its list. */ rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ /* * Lock the dnode handle to prevent the dnode from obtaining any new * holds. This also prevents the descendant dbufs and the bonus dbuf * from accessing the dnode, so that we can discount their holds. The * handle is safe to access because we know that while the dnode cannot * go away, neither can its handle. Once we hold dnh_zrlock, we can * safely move any dnode referenced only by dbufs. */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); return (KMEM_CBRC_LATER); } /* * Ensure a consistent view of the dnode's holds and the dnode's dbufs. * We need to guarantee that there is a hold for every dbuf in order to * determine whether the dnode is actively referenced. Falsely matching * a dbuf to an active hold would lead to an unsafe move. 
It's possible * that a thread already having an active dnode hold is about to add a * dbuf, and we can't compare hold and dbuf counts while the add is in * progress. */ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); return (KMEM_CBRC_LATER); } /* * A dbuf may be removed (evicted) without an active dnode hold. In that * case, the dbuf count is decremented under the handle lock before the * dbuf's hold is released. This order ensures that if we count the hold * after the dbuf is removed but before its hold is released, we will * treat the unmatched hold as active and exit safely. If we count the * hold before the dbuf is removed, the hold is discounted, and the * removal is blocked until the move completes. */ refcount = refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); dbufs = odn->dn_dbufs_count; /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, uint32_t, dbufs); if (refcount > dbufs) { rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); return (KMEM_CBRC_LATER); } rw_exit(&odn->dn_struct_rwlock); /* * At this point we know that anyone with a hold on the dnode is not * actively referencing it. The dnode is known and in a valid state to * move. We're holding the locks needed to execute the critical section. */ dnode_move_impl(odn, ndn); list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. */ ASSERT(refcount == refcount_count(&ndn->dn_holds)); ASSERT(dbufs == ndn->dn_dbufs_count); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); return (KMEM_CBRC_YES); } -#endif /* _KERNEL */ #endif /* illumos */ +#endif /* _KERNEL */ void dnode_special_close(dnode_handle_t *dnh) { dnode_t *dn = dnh->dnh_dnode; /* * Wait for final references to the dnode to clear. This can * only happen if the arc is asyncronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ while (refcount_count(&dn->dn_holds) > 0) delay(1); ASSERT(dn->dn_dbuf == NULL || dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); } static void dnode_buf_evict_async(void *dbu) { dnode_children_t *children_dnodes = dbu; int i; for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to * another valid address, so there is no need here to guard * against changes to or from NULL. */ if (dnh->dnh_dnode == NULL) { zrl_destroy(&dnh->dnh_zrlock); continue; } zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus * it wouldn't be eligible for eviction and this function * would not have been called. 
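dnode_move() above only proceeds when every hold on the dnode is accounted for by a child dbuf (refcount == dbufs); any extra hold implies an active user and the relocation is deferred. A tiny decision-only sketch of that comparison, with locally defined result names standing in for the kmem_cbrc_t values, is below; illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the kmem move-callback results used by dnode_move(). */
typedef enum { CBRC_YES, CBRC_LATER } cbrc_sketch_t;

/*
 * If every hold is matched by a child dbuf, nothing can be actively
 * dereferencing the dnode and it is safe to relocate; otherwise defer.
 */
static cbrc_sketch_t
move_decision(int64_t holds, uint32_t dbufs)
{
	return (holds > (int64_t)dbufs ? CBRC_LATER : CBRC_YES);
}

int
main(void)
{
	printf("holds=3 dbufs=3 -> %s\n",
	    move_decision(3, 3) == CBRC_YES ? "move now" : "try later");
	printf("holds=4 dbufs=3 -> %s\n",
	    move_decision(4, 3) == CBRC_YES ? "move now" : "try later");
	return (0);
}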
*/ ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* * errors: * EINVAL - invalid object number. * EIO - i/o error. * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_children_t *children_dnodes; dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool * which may require us to read from the root filesystem while * holding some (not all) of the locks as writer. */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); (void) refcount_add(&dn->dn_holds, tag); *dnp = dn; return (0); } if (object == 0 || object >= DN_MAX_OBJECT) return (SET_ERROR(EINVAL)); mdn = DMU_META_DNODE(os); ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { rw_enter(&mdn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { dbuf_rele(db, FTAG); return (err); } ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; idx = object & (epb-1); ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); } dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, dnode_buf_evict_async, NULL); winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); if (winner != NULL) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); } kmem_free(children_dnodes, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); children_dnodes = winner; } } ASSERT(children_dnodes->dnc_count == epb); dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; dn = dnode_create(os, phys, db, object, dnh); } mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ((flag & DNODE_MUST_BE_FREE) && (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? 
ENOENT : EEXIST); } if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); *dnp = dn; return (0); } /* * Return held dnode if the object is allocated, NULL if not. */ int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } /* * Can only add a reference if there is already at least one * reference on the dnode. Returns FALSE if unable to add a * new reference. */ boolean_t dnode_add_ref(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); if (refcount_is_zero(&dn->dn_holds)) { mutex_exit(&dn->dn_mtx); return (FALSE); } VERIFY(1 < refcount_add(&dn->dn_holds, tag)); mutex_exit(&dn->dn_mtx); return (TRUE); } void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag); } void dnode_rele_and_unlock(dnode_t *dn, void *tag) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ dmu_buf_impl_t *db = dn->dn_dbuf; dnode_handle_t *dnh = dn->dn_handle; refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); /* * It's unsafe to release the last hold on a dnode by dnode_rele() or * indirectly by dbuf_rele() while relying on the dnode handle to * prevent the dnode from moving, since releasing the last hold could * result in the dnode's parent dbuf evicting its dnode handles. For * that reason anyone calling dnode_rele() or dbuf_rele() without some * other direct or indirect hold on the dnode must first drop the dnode * handle. */ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { /* * Another thread could add a hold to the dnode handle in * dnode_hold_impl() while holding the parent dbuf. Since the * hold on the parent dbuf prevents the handle from being * destroyed, the hold on the handle is OK. We can't yet assert * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ dbuf_rele(db, dnh); } } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx); return; } DNODE_VERIFY(dn); #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif /* * Determine old uid/gid when necessary */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); /* * If we are already marked dirty, we're done. */ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { multilist_sublist_unlock(mls); return; } ASSERT(!refcount_is_zero(&dn->dn_holds) || !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); multilist_sublist_insert_head(mls, dn); multilist_sublist_unlock(mls); /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. 
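dnode_setdirty() and the dn_next_* bookkeeping above key every per-transaction-group array by txg & TXG_MASK, i.e. the txg number modulo the number of in-flight txgs. A tiny indexing sketch (TXG_SIZE assumed to be 4, so TXG_MASK is 3) is below; illustrative only.

#include <stdint.h>
#include <stdio.h>

#define SK_TXG_SIZE	4			/* in-flight txgs (assumed) */
#define SK_TXG_MASK	(SK_TXG_SIZE - 1)

int
main(void)
{
	uint64_t next_blksz[SK_TXG_SIZE] = { 0 };

	/* Successive txgs reuse the same four slots, round-robin. */
	for (uint64_t txg = 20; txg < 26; txg++) {
		int slot = txg & SK_TXG_MASK;
		next_blksz[slot] = 512 * txg;	/* pretend per-txg state */
		printf("txg=%llu -> slot %d (blksz=%llu)\n",
		    (unsigned long long)txg, slot,
		    (unsigned long long)next_blksz[slot]);
	}
	return (0);
}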
Each instantiated child * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) { mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { mutex_exit(&dn->dn_mtx); return; } dn->dn_free_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); } /* * Try to change the block size for the indicated dnode. This can only * succeed if there are no blocks allocated or dirty beyond first block */ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db; int err; ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); if (ibs == dn->dn_indblkshift) ibs = 0; if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } } mutex_exit(&dn->dn_dbufs_mtx); if (ibs && dn->dn_nlevels != 1) goto fail; /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; if (ibs) { dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); fail: rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * if we have a read-lock, check to see if we need to do any work * before upgrading to a write-lock. */ if (have_read) { if (blkid <= dn->dn_maxblkid) return; if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } } if (blkid <= dn->dn_maxblkid) goto out; dn->dn_maxblkid = blkid; /* * Compute the number of levels necessary to support the new maxblkid. 
*/ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; if (new_nlevels > dn->dn_nlevels) { int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; list_t *list; dbuf_dirty_record_t *new, *dr, *dr_next; dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; } } mutex_exit(&new->dt.di.dr_mtx); mutex_exit(&dn->dn_mtx); } out: if (have_read) rw_downgrade(&dn->dn_struct_rwlock); } static void dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); if (db != NULL) { dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } } void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } /* * First, block align the region to free: */ if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. * Note that we won't dirty any indirect blocks, * which is fine because we will be freeing the entire * file and thus all indirect blocks will be freed * by free_children(). */ blkid = 0; nblks = 1; goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ goto out; } else { /* Freeing part of the block. 
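dnode_new_blkid() above grows the indirection depth until the block-pointer fan-out covers the new maxblkid: starting from dn_nblkptr direct pointers, each extra level multiplies coverage by 2^epbs. A standalone sketch of that loop, with assumed example values and without the overflow guard of the original, is below; illustrative only.

#include <stdint.h>
#include <stdio.h>

/* How many levels are needed so that the tree spans block id 'blkid'. */
static int
levels_needed(uint64_t blkid, uint64_t nblkptr, int epbs)
{
	int nlevels = 1;
	uint64_t sz;

	/* Mirrors the loop in dnode_new_blkid(): widen by 2^epbs per level. */
	for (sz = nblkptr; sz <= blkid; sz <<= epbs)
		nlevels++;
	return (nlevels);
}

int
main(void)
{
	/* Assumed example: 3 direct blkptrs, 128K indirects => epbs = 17 - 7 = 10. */
	printf("blkid=2       -> %d levels\n", levels_needed(2, 3, 10));
	printf("blkid=5000    -> %d levels\n", levels_needed(5000, 3, 10));
	printf("blkid=5000000 -> %d levels\n", levels_needed(5000000, 3, 10));
	return (0);
}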
*/ head = blksz - off; ASSERT3U(head, >, 0); } blkoff = off; } /* zero out any partial block data at the start of the range */ if (head) { ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); } dbuf_rele(db, FTAG); } off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) goto out; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) goto out; ASSERT(ISP2(blksz)); if (trunc) tail = 0; else tail = P2PHASE(len, blksz); ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { if (len < tail) tail = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) goto out; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); blkid = off >> blkshift; nblks = len >> blkshift; if (trunc) nblks += 1; /* * Dirty all the indirect blocks in this range. Note that only * the first and last indirect blocks can actually be written * (if they were partially freed) -- they must be dirtied, even if * they do not exist on disk yet. The interior blocks will * be freed by free_children(), so they will not actually be written. * Even though these interior blocks will not be written, we * dirty them for two reasons: * * - It ensures that the indirect blocks remain in memory until * syncing context. (They have already been prefetched by * dmu_tx_hold_free(), so we don't have to worry about reading * them serially here.) * * - The dirty space accounting will put pressure on the txg sync * mechanism to begin syncing, and to delay transactions if there * is a large amount of freeing. Even though these indirect * blocks will not be written, we could need to write the same * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { uint64_t first, last; first = blkid >> epbs; dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; if (last != first) dnode_dirty_l1(dn, last, tx); int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { /* * Set i to the blockid of the next non-hole * level-1 indirect block at or after i. Note * that dnode_next_offset() operates in terms of * level-0-equivalent bytes. */ uint64_t ibyte = i << shift; int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); i = ibyte >> shift; if (i >= last) break; /* * Normally we should not see an error, either * from dnode_next_offset() or dbuf_hold_level() * (except for ESRCH from dnode_next_offset). 
* If there is an i/o error, then when we read * this block in syncing context, it will use * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according * to the "failmode" property. dnode_next_offset() * doesn't have a flag to indicate MUSTSUCCEED. */ if (err != 0) break; dnode_dirty_l1(dn, i, tx); } } done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", blkid, nblks, tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: rw_exit(&dn->dn_struct_rwlock); } static boolean_t dnode_spill_freed(dnode_t *dn) { int i; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); /* * If we're in the process of opening the pool, dp will not be * set yet, but there shouldn't be anything dirty. */ if (dp == NULL) return (FALSE); if (dn->dn_free_txg) return (TRUE); if (blkid == DMU_SPILL_BLKID) return (dnode_spill_freed(dn)); mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_free_ranges[i] != NULL && range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* call from syncing context when we actually write/free space for this dnode */ void dnode_diduse_space(dnode_t *dn, int64_t delta) { uint64_t space; dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", dn, dn->dn_phys, (u_longlong_t)dn->dn_phys->dn_used, (longlong_t)delta); mutex_enter(&dn->dn_mtx); space = DN_USED_BYTES(dn->dn_phys); if (delta > 0) { ASSERT3U(space + delta, >=, space); /* no overflow */ } else { ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; } else { dn->dn_phys->dn_used = space; dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; } mutex_exit(&dn->dn_mtx); } /* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * * If level > 0, then we are scanning an indirect block looking at its * pointers. If level == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. * Otherwise, return with *offset pointing to the beginning (if searching * forwards) or end (if searching backwards) of the range covered by the * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until * we find something (i.e., we don't return ESRCH) and then search back * down the tree (narrow the search) until we reach our original search * level. 
*/ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; boolean_t hole; int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); /* * This can only happen when we are searching up * the block tree for data. We don't really need to * adjust the offset, as we will just end up looking * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); for (i = (*offset >> span) & (blkfill - 1); i >= 0 && i < blkfill; i += inc) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; *offset += (1ULL << span) * inc; } if (i < 0 || i == blkfill) error = SET_ERROR(ESRCH); } else { blkptr_t *bp = data; uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); if (hole) maxfill--; else minfill++; *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; } *offset = *offset << span; if (inc < 0) { /* traversing backwards; position offset at the end */ ASSERT3U(*offset, <=, start); *offset = MIN(*offset + (1ULL << span) - 1, start); } else if (*offset < start) { *offset = start; } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); } if (db) dbuf_rele(db, FTAG); return (error); } /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. 
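The widen-then-narrow search described above (climb levels until a per-level probe stops returning ESRCH, then walk back down so the offset is refined at the requested level) can be sketched generically as below, with a hypothetical probe callback standing in for dnode_next_offset_level(); illustrative only, not part of this change.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_ESRCH	3	/* "not found at this level" */

/* Hypothetical per-level probe, in the role of dnode_next_offset_level(). */
typedef int (*probe_fn_t)(int lvl, uint64_t *offset, void *arg);

/*
 * Search upward from minlvl until a level reports a match (or a hard error),
 * then search back down so *offset is refined at the original level.
 */
static int
widen_then_narrow(probe_fn_t probe, void *arg, int minlvl, int maxlvl,
    uint64_t *offset)
{
	int lvl, error = SKETCH_ESRCH;

	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
		error = probe(lvl, offset, arg);
		if (error != SKETCH_ESRCH)
			break;
	}
	while (error == 0 && --lvl >= minlvl)
		error = probe(lvl, offset, arg);
	return (error);
}

/*
 * Toy probe: a single "match" lives at offset 200.  A probe at level lvl can
 * only see 10^lvl units ahead of *offset; on success it moves *offset to the
 * start of the matched region for that level.
 */
static int
toy_probe(int lvl, uint64_t *offset, void *arg)
{
	const uint64_t target = 200;
	uint64_t reach = 1;

	(void)arg;
	for (int i = 0; i < lvl; i++)
		reach *= 10;
	if (*offset > target || target - *offset >= reach)
		return (SKETCH_ESRCH);
	*offset = target - (target % reach);
	return (0);
}

int
main(void)
{
	uint64_t off = 150;
	int err = widen_then_narrow(toy_probe, NULL, 1, 3, &off);

	printf("err=%d offset=%llu\n", err, (unsigned long long)off);
	return (0);
}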
* Used in dmu_object_alloc(). */ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; if (!(flags & DNODE_FIND_HAVELOCK)) rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { error = SET_ERROR(ESRCH); goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = SET_ERROR(ESRCH); } goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); } /* * There's always a "virtual hole" at the end of the object, even * if all BP's which physically exist are non-holes. */ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { error = 0; } if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? initial_offset < *offset : initial_offset > *offset)) error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); } Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h (revision 332526) @@ -1,312 +1,313 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2014 Garrett D'Amore * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 RackTop Systems. */ #ifndef _SYS_ACL_H #define _SYS_ACL_H #include #include #if defined(_KERNEL) /* * When compiling OpenSolaris kernel code, this file is included instead of the * FreeBSD one. Include the original sys/acl.h as well. */ #undef _SYS_ACL_H #include_next #define _SYS_ACL_H #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif #define MAX_ACL_ENTRIES (1024) /* max entries of each type */ typedef struct { int a_type; /* the type of ACL entry */ uid_t a_id; /* the entry in -uid or gid */ o_mode_t a_perm; /* the permission field */ } aclent_t; typedef struct ace { uid_t a_who; /* uid or gid */ uint32_t a_access_mask; /* read,write,... */ uint16_t a_flags; /* see below */ uint16_t a_type; /* allow or deny */ } ace_t; #ifndef _KERNEL typedef struct acl_info acl_t; #endif /* * The following are Defined types for an aclent_t. 
*/ #define USER_OBJ (0x01) /* object owner */ #define USER (0x02) /* additional users */ #define GROUP_OBJ (0x04) /* owning group of the object */ #define GROUP (0x08) /* additional groups */ #define CLASS_OBJ (0x10) /* file group class and mask entry */ #define OTHER_OBJ (0x20) /* other entry for the object */ #define ACL_DEFAULT (0x1000) /* default flag */ /* default object owner */ #define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) /* default additional users */ #define DEF_USER (ACL_DEFAULT | USER) /* default owning group */ #define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) /* default additional groups */ #define DEF_GROUP (ACL_DEFAULT | GROUP) /* default mask entry */ #define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) /* default other entry */ #define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) /* * The following are defined for ace_t. */ #define ACE_READ_DATA 0x00000001 #define ACE_LIST_DIRECTORY 0x00000001 #define ACE_WRITE_DATA 0x00000002 #define ACE_ADD_FILE 0x00000002 #define ACE_APPEND_DATA 0x00000004 #define ACE_ADD_SUBDIRECTORY 0x00000004 #define ACE_READ_NAMED_ATTRS 0x00000008 #define ACE_WRITE_NAMED_ATTRS 0x00000010 #define ACE_EXECUTE 0x00000020 #define ACE_DELETE_CHILD 0x00000040 #define ACE_READ_ATTRIBUTES 0x00000080 #define ACE_WRITE_ATTRIBUTES 0x00000100 #define ACE_DELETE 0x00010000 #define ACE_READ_ACL 0x00020000 #define ACE_WRITE_ACL 0x00040000 #define ACE_WRITE_OWNER 0x00080000 #define ACE_SYNCHRONIZE 0x00100000 #define ACE_FILE_INHERIT_ACE 0x0001 #define ACE_DIRECTORY_INHERIT_ACE 0x0002 #define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 #define ACE_INHERIT_ONLY_ACE 0x0008 #define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 #define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 #define ACE_IDENTIFIER_GROUP 0x0040 #define ACE_INHERITED_ACE 0x0080 #define ACE_OWNER 0x1000 #define ACE_GROUP 0x2000 #define ACE_EVERYONE 0x4000 #define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 #define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 #define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 #define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 #define ACL_AUTO_INHERIT 0x0001 #define ACL_PROTECTED 0x0002 #define ACL_DEFAULTED 0x0004 #define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ ACL_DEFAULTED) -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_FAKE_KERNEL) /* * These are only applicable in a CIFS context. */ #define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 #define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 #define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 #define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 #define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 #define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 #define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A #define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B #define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C #define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D #define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E #define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F #define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 #define ACE_ALL_TYPES 0x001F typedef struct ace_object { uid_t a_who; /* uid or gid */ uint32_t a_access_mask; /* read,write,... 
*/ uint16_t a_flags; /* see below */ uint16_t a_type; /* allow or deny */ uint8_t a_obj_type[16]; /* obj type */ uint8_t a_inherit_obj_type[16]; /* inherit obj */ } ace_object_t; #endif #define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ ACE_WRITE_OWNER|ACE_SYNCHRONIZE) #define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) #define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ ACE_READ_NAMED_ATTRS) #define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS) #define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) /* * The following flags are supported by both NFSv4 ACLs and ace_t. */ #define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ ACE_DIRECTORY_INHERIT_ACE | \ ACE_NO_PROPAGATE_INHERIT_ACE | \ ACE_INHERIT_ONLY_ACE | \ ACE_INHERITED_ACE | \ ACE_IDENTIFIER_GROUP) #define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ ACE_IDENTIFIER_GROUP) #define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) /* cmd args to acl(2) for aclent_t */ #define GETACL 1 #define SETACL 2 #define GETACLCNT 3 /* cmd's to manipulate ace acls. */ #define ACE_GETACL 4 #define ACE_SETACL 5 #define ACE_GETACLCNT 6 /* minimal acl entries from GETACLCNT */ #define MIN_ACL_ENTRIES 4 #if !defined(_KERNEL) /* acl check errors */ #define GRP_ERROR 1 #define USER_ERROR 2 #define OTHER_ERROR 3 #define CLASS_ERROR 4 #define DUPLICATE_ERROR 5 #define MISS_ERROR 6 #define MEM_ERROR 7 #define ENTRY_ERROR 8 /* * similar to ufs_acl.h: changed to char type for user commands (tar, cpio) * Attribute types */ #define UFSD_FREE ('0') /* Free entry */ #define UFSD_ACL ('1') /* Access Control Lists */ #define UFSD_DFACL ('2') /* reserved for future use */ #define ACE_ACL ('3') /* ace_t style acls */ /* * flag to [f]acl_get() * controls whether a trivial acl should be returned. 
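As an illustration of how the ace_t layout and the ACE_* bits above fit together, the sketch below fills in the three abstract entries (owner@, group@, everyone@) of a simple allow ACL. The particular permission mix is arbitrary, and treating a_who as not meaningful for abstract entries is an assumption of this sketch rather than something this header states.

/*
 * Illustrative only: build a minimal three-entry allow ACL using the ace_t
 * structure and ACE_* constants defined above.
 */
static void
build_simple_acl(ace_t aces[3])
{
        /* owner@: read and write */
        aces[0].a_who = (uid_t)-1;      /* assumed unused for abstract entries */
        aces[0].a_access_mask = ACE_READ_DATA | ACE_WRITE_DATA;
        aces[0].a_flags = ACE_OWNER;
        aces[0].a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;

        /* group@: read only */
        aces[1].a_who = (uid_t)-1;
        aces[1].a_access_mask = ACE_READ_DATA;
        aces[1].a_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
        aces[1].a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;

        /* everyone@: read only */
        aces[2].a_who = (uid_t)-1;
        aces[2].a_access_mask = ACE_READ_DATA;
        aces[2].a_flags = ACE_EVERYONE;
        aces[2].a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
}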
*/ #define ACL_NO_TRIVIAL 0x2 /* * Flags to control acl_totext() */ #define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */ #define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */ #define ACL_NORESOLVE 0x4 /* don't do name service lookups */ #define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */ /* * Legacy aclcheck errors for aclent_t ACLs */ #define EACL_GRP_ERROR GRP_ERROR #define EACL_USER_ERROR USER_ERROR #define EACL_OTHER_ERROR OTHER_ERROR #define EACL_CLASS_ERROR CLASS_ERROR #define EACL_DUPLICATE_ERROR DUPLICATE_ERROR #define EACL_MISS_ERROR MISS_ERROR #define EACL_MEM_ERROR MEM_ERROR #define EACL_ENTRY_ERROR ENTRY_ERROR #define EACL_INHERIT_ERROR 9 /* invalid inherit flags */ #define EACL_FLAGS_ERROR 10 /* unknown flag value */ #define EACL_PERM_MASK_ERROR 11 /* unknown permission */ #define EACL_COUNT_ERROR 12 /* invalid acl count */ #define EACL_INVALID_SLOT 13 /* invalid acl slot */ #define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */ #define EACL_DIFF_TYPE 15 /* acls aren't same type */ #define EACL_INVALID_USER_GROUP 16 /* need user/group name */ #define EACL_INVALID_STR 17 /* invalid acl string */ #define EACL_FIELD_NOT_BLANK 18 /* can't have blank field */ #define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */ #define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */ #define EACL_MISSING_FIELDS 21 /* missing fields in acl */ #define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */ extern int aclcheck(aclent_t *, int, int *); extern int acltomode(aclent_t *, int, mode_t *); extern int aclfrommode(aclent_t *, int, mode_t *); extern int aclsort(int, int, aclent_t *); extern char *acltotext(aclent_t *, int); extern aclent_t *aclfromtext(char *, int *); extern void acl_free(acl_t *); extern int acl_get(const char *, int, acl_t **); extern int facl_get(int, int, acl_t **); extern int acl_set(const char *, acl_t *acl); extern int facl_set(int, acl_t *acl); extern int acl_strip(const char *, uid_t, gid_t, mode_t); extern int acl_trivial(const char *); extern char *acl_totext(acl_t *, int); extern int acl_fromtext(const char *, acl_t **); extern int acl_check(acl_t *, int); #else /* !defined(_KERNEL) */ extern void ksort(caddr_t, int, int, int (*)(void *, void *)); extern int cmp2acls(void *, void *); #endif /* !defined(_KERNEL) */ extern int acl(const char *path, int cmd, int cnt, void *buf); extern int facl(int fd, int cmd, int cnt, void *buf); #ifdef __cplusplus } #endif #endif /* _SYS_ACL_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h (revision 332526) @@ -1,197 +1,198 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
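The userland prototypes above (acl_get(), acl_totext(), acl_free() and the ACL_* text flags) are enough for a small diagnostic tool. A hedged example follows; it assumes a Solaris/illumos-style environment where this header is the one reached as <sys/acl.h> and where these routines are provided by libsec, neither of which holds for a stock FreeBSD userland.

#include <stdio.h>
#include <stdlib.h>
#include <sys/acl.h>

int
main(int argc, char *argv[])
{
        acl_t *aclp;
        char *text;

        if (argc != 2) {
                fprintf(stderr, "usage: %s path\n", argv[0]);
                return (1);
        }
        /* Second argument 0: return the ACL even if it is trivial. */
        if (acl_get(argv[1], 0, &aclp) != 0) {
                perror("acl_get");
                return (1);
        }
        /* ACL_COMPACT_FMT renders the ACL the way "ls -V" would. */
        text = acl_totext(aclp, ACL_COMPACT_FMT);
        if (text != NULL) {
                printf("%s\n", text);
                free(text);
        }
        acl_free(aclp);
        return (0);
}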
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2017 RackTop Systems. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ #ifndef _SYS_BITMAP_H #define _SYS_BITMAP_H #ifdef __cplusplus extern "C" { #endif #include #if defined(__GNUC__) && defined(_ASM_INLINES) && \ (defined(__i386) || defined(__amd64)) #include #endif /* * Operations on bitmaps of arbitrary size * A bitmap is a vector of 1 or more ulong_t's. * The user of the package is responsible for range checks and keeping * track of sizes. */ #ifdef _LP64 #define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ #define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL, to extract word index */ #else #define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ #endif #define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ #define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ #ifdef _LP64 #define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */ #define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */ #define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */ #else #define BT_ULMAXMASK 0xffffffff #endif /* * bitmap is a ulong_t *, bitindex an index_t * * The macros BT_WIM and BT_BIW internal; there is no need * for users of this package to use them. */ /* * word in map */ #define BT_WIM(bitmap, bitindex) \ ((bitmap)[(bitindex) >> BT_ULSHIFT]) /* * bit in word */ #define BT_BIW(bitindex) \ (1UL << ((bitindex) & BT_ULMASK)) #ifdef _LP64 #define BT_WIM32(bitmap, bitindex) \ ((bitmap)[(bitindex) >> BT_ULSHIFT32]) #define BT_BIW32(bitindex) \ (1UL << ((bitindex) & BT_ULMASK32)) #endif /* * These are public macros * * BT_BITOUL == n bits to n ulong_t's */ #define BT_BITOUL(nbits) \ (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) #define BT_SIZEOFMAP(nbits) \ (BT_BITOUL(nbits) * sizeof (ulong_t)) #define BT_TEST(bitmap, bitindex) \ ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) #define BT_SET(bitmap, bitindex) \ { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } #define BT_CLEAR(bitmap, bitindex) \ { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } #ifdef _LP64 #define BT_BITOUL32(nbits) \ (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32) #define BT_SIZEOFMAP32(nbits) \ (BT_BITOUL32(nbits) * sizeof (uint_t)) #define BT_TEST32(bitmap, bitindex) \ ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0) #define BT_SET32(bitmap, bitindex) \ { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); } #define BT_CLEAR32(bitmap, bitindex) \ { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); } #endif /* _LP64 */ /* * BIT_ONLYONESET is a private macro not designed for bitmaps of * arbitrary size. u must be an unsigned integer/long. It returns * true if one and only one bit is set in u. */ #define BIT_ONLYONESET(u) \ ((((u) == 0) ? 
0 : ((u) & ((u) - 1)) == 0)) -#if defined(_KERNEL) && !defined(_ASM) +#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_ASM) #include /* * return next available bit index from map with specified number of bits */ extern index_t bt_availbit(ulong_t *bitmap, size_t nbits); /* * find the highest order bit that is on, and is within or below * the word specified by wx */ extern int bt_gethighbit(ulong_t *mapp, int wx); extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos); /* * Find highest and lowest one bit set. * Returns bit number + 1 of bit that is set, otherwise returns 0. * Low order bit is 0, high order bit is 31. */ extern int highbit(ulong_t); extern int highbit64(uint64_t); extern int lowbit(ulong_t); extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop); extern void bt_copy(ulong_t *, ulong_t *, ulong_t); /* * find the parity */ extern int odd_parity(ulong_t); /* * Atomically set/clear bits * Atomic exclusive operations will set "result" to "-1" * if the bit is already set/cleared. "result" will be set * to 0 otherwise. */ #define BT_ATOMIC_SET(bitmap, bitindex) \ { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); } #define BT_ATOMIC_CLEAR(bitmap, bitindex) \ { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); } #define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \ { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \ (bitindex) % BT_NBIPUL); } #define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \ { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \ (bitindex) % BT_NBIPUL); } /* * Extracts bits between index h (high, inclusive) and l (low, exclusive) from * u, which must be an unsigned integer. */ #define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU)) -#endif /* _KERNEL && !_ASM */ +#endif /* (_KERNEL || _FAKE_KERNEL) && !_ASM */ #ifdef __cplusplus } #endif #endif /* _SYS_BITMAP_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h (revision 332526) @@ -1,157 +1,158 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. */ #ifndef _SYS_CPUPART_H #define _SYS_CPUPART_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_FAKE_KERNEL) typedef int cpupartid_t; /* * Special partition id. 
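The change above widens the kernel-only region of this header from _KERNEL to _KERNEL || _FAKE_KERNEL, so the bt_*() declarations, the BT_ATOMIC_* macros and BITX() also become visible to fake-kernel builds (presumably userland consumers compiled with -D_FAKE_KERNEL; that motivation is inferred from the guard change, not stated here). The public BT_* macros were already unconditional, and the small demonstration below uses only those. It assumes an environment where this compat header is reachable as <sys/bitmap.h> and where ulong_t is defined.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/bitmap.h>

#define DEMO_NBITS      200

int
main(void)
{
        /* BT_BITOUL() rounds the bit count up to whole ulong_t words. */
        ulong_t map[BT_BITOUL(DEMO_NBITS)];

        memset(map, 0, BT_SIZEOFMAP(DEMO_NBITS));

        BT_SET(map, 3);
        BT_SET(map, 130);
        BT_CLEAR(map, 3);

        printf("bit 3: %d, bit 130: %d, bit 131: %d\n",
            BT_TEST(map, 3), BT_TEST(map, 130), BT_TEST(map, 131));
        return (0);
}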
*/ #define CP_DEFAULT 0 /* * Flags for cpupart_list() */ #define CP_ALL 0 /* return all cpu partitions */ #define CP_NONEMPTY 1 /* return only non-empty ones */ typedef struct cpupart { disp_t cp_kp_queue; /* partition-wide kpreempt queue */ cpupartid_t cp_id; /* partition ID */ int cp_ncpus; /* number of online processors */ struct cpupart *cp_next; /* next partition in list */ struct cpupart *cp_prev; /* previous partition in list */ struct cpu *cp_cpulist; /* processor list */ struct kstat *cp_kstat; /* per-partition statistics */ /* * cp_nrunnable and cp_nrunning are used to calculate load average. */ uint_t cp_nrunnable; /* current # of runnable threads */ uint_t cp_nrunning; /* current # of running threads */ /* * cp_updates, cp_nrunnable_cum, cp_nwaiting_cum, and cp_hp_avenrun * are used to generate kstat information on an as-needed basis. */ uint64_t cp_updates; /* number of statistics updates */ uint64_t cp_nrunnable_cum; /* cum. # of runnable threads */ uint64_t cp_nwaiting_cum; /* cum. # of waiting threads */ struct loadavg_s cp_loadavg; /* cpupart loadavg */ klgrpset_t cp_lgrpset; /* set of lgroups on which this */ /* partition has cpus */ lpl_t *cp_lgrploads; /* table of load averages for this */ /* partition, indexed by lgrp ID */ int cp_nlgrploads; /* size of cp_lgrploads table */ uint64_t cp_hp_avenrun[3]; /* high-precision load average */ uint_t cp_attr; /* bitmask of attributes */ lgrp_gen_t cp_gen; /* generation number */ lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */ bitset_t cp_cmt_pgs; /* CMT PGs represented */ bitset_t cp_haltset; /* halted CPUs */ } cpupart_t; typedef struct cpupart_kstat { kstat_named_t cpk_updates; /* number of updates */ kstat_named_t cpk_runnable; /* cum # of runnable threads */ kstat_named_t cpk_waiting; /* cum # waiting for I/O */ kstat_named_t cpk_ncpus; /* current # of CPUs */ kstat_named_t cpk_avenrun_1min; /* 1-minute load average */ kstat_named_t cpk_avenrun_5min; /* 5-minute load average */ kstat_named_t cpk_avenrun_15min; /* 15-minute load average */ } cpupart_kstat_t; /* * Macro to obtain the maximum run priority for the global queue associated * with given cpu partition. */ #define CP_MAXRUNPRI(cp) ((cp)->cp_kp_queue.disp_maxrunpri) /* * This macro is used to determine if the given thread must surrender * CPU to higher priority runnable threads on one of its dispatch queues. * This should really be defined in but it is not because * including there would cause recursive includes. */ #define DISP_MUST_SURRENDER(t) \ ((DISP_MAXRUNPRI(t) > DISP_PRIO(t)) || \ (CP_MAXRUNPRI(t->t_cpupart) > DISP_PRIO(t))) extern cpupart_t cp_default; extern cpupart_t *cp_list_head; extern uint_t cp_numparts; extern uint_t cp_numparts_nonempty; /* * Each partition contains a bitset that indicates which CPUs are halted and * which ones are running. Given the growing number of CPUs in current and * future platforms, it's important to fanout each CPU within its partition's * haltset to prevent contention due to false sharing. The fanout factor * is platform specific, and declared accordingly. 
*/ extern uint_t cp_haltset_fanout; extern void cpupart_initialize_default(); extern cpupart_t *cpupart_find(psetid_t); extern int cpupart_create(psetid_t *); extern int cpupart_destroy(psetid_t); extern psetid_t cpupart_query_cpu(cpu_t *); extern int cpupart_attach_cpu(psetid_t, cpu_t *, int); extern int cpupart_get_cpus(psetid_t *, processorid_t *, uint_t *); extern int cpupart_bind_thread(kthread_id_t, psetid_t, int, void *, void *); extern void cpupart_kpqalloc(pri_t); extern int cpupart_get_loadavg(psetid_t, int *, int); extern uint_t cpupart_list(psetid_t *, uint_t, int); extern int cpupart_setattr(psetid_t, uint_t); extern int cpupart_getattr(psetid_t, uint_t *); -#endif /* _KERNEL */ +#endif /* _KERNEL || _FAKE_KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_CPUPART_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h (revision 332526) @@ -1,828 +1,830 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Igor Kozhukhov . + * Copyright 2017 RackTop Systems. */ #ifndef _SYS_CPUVAR_H #define _SYS_CPUVAR_H #include #include /* has cpu_stat_t definition */ #include #include +#include /* has kcpc_ctx_t definition */ #include #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) #include #endif #include #include #include #include #include #if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL) && \ (defined(__i386) || defined(__amd64)) #include #endif #ifdef __cplusplus extern "C" { #endif struct squeue_set_s; #define CPU_CACHE_COHERENCE_SIZE 64 /* * For fast event tracing. */ struct ftrace_record; typedef struct ftrace_data { int ftd_state; /* ftrace flags */ kmutex_t ftd_unused; /* ftrace buffer lock, unused */ struct ftrace_record *ftd_cur; /* current record */ struct ftrace_record *ftd_first; /* first record */ struct ftrace_record *ftd_last; /* last record */ } ftrace_data_t; struct cyc_cpu; struct nvlist; /* * Per-CPU data. * * Be careful adding new members: if they are not the same in all modules (e.g. * change size depending on a #define), CTF uniquification can fail to work * properly. Furthermore, this is transitive in that it applies recursively to * all types pointed to by cpu_t. 
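A kernel-side sketch of consuming the partition list declared above follows. The circular cp_next linkage and the use of cpu_lock to stabilize the walk are assumptions carried over from common illumos usage; this header does not spell out the locking rule.

/*
 * Sketch: report how many online CPUs each partition currently owns.
 */
static void
cpupart_report(void)
{
        cpupart_t *cp;

        mutex_enter(&cpu_lock);
        cp = cp_list_head;
        do {
                cmn_err(CE_CONT, "partition %d: %d online CPUs\n",
                    (int)cp->cp_id, cp->cp_ncpus);
                cp = cp->cp_next;
        } while (cp != cp_list_head);
        mutex_exit(&cpu_lock);
}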
*/ typedef struct cpu { processorid_t cpu_id; /* CPU number */ processorid_t cpu_seqid; /* sequential CPU id (0..ncpus-1) */ volatile cpu_flag_t cpu_flags; /* flags indicating CPU state */ struct cpu *cpu_self; /* pointer to itself */ kthread_t *cpu_thread; /* current thread */ kthread_t *cpu_idle_thread; /* idle thread for this CPU */ kthread_t *cpu_pause_thread; /* pause thread for this CPU */ klwp_id_t cpu_lwp; /* current lwp (if any) */ klwp_id_t cpu_fpowner; /* currently loaded fpu owner */ struct cpupart *cpu_part; /* partition with this CPU */ struct lgrp_ld *cpu_lpl; /* pointer to this cpu's load */ int cpu_cache_offset; /* see kmem.c for details */ /* * Links to other CPUs. It is safe to walk these lists if * one of the following is true: * - cpu_lock held * - preemption disabled via kpreempt_disable * - PIL >= DISP_LEVEL * - acting thread is an interrupt thread * - all other CPUs are paused */ struct cpu *cpu_next; /* next existing CPU */ struct cpu *cpu_prev; /* prev existing CPU */ struct cpu *cpu_next_onln; /* next online (enabled) CPU */ struct cpu *cpu_prev_onln; /* prev online (enabled) CPU */ struct cpu *cpu_next_part; /* next CPU in partition */ struct cpu *cpu_prev_part; /* prev CPU in partition */ struct cpu *cpu_next_lgrp; /* next CPU in latency group */ struct cpu *cpu_prev_lgrp; /* prev CPU in latency group */ struct cpu *cpu_next_lpl; /* next CPU in lgrp partition */ struct cpu *cpu_prev_lpl; struct cpu_pg *cpu_pg; /* cpu's processor groups */ void *cpu_reserved[4]; /* reserved for future use */ /* * Scheduling variables. */ disp_t *cpu_disp; /* dispatch queue data */ /* * Note that cpu_disp is set before the CPU is added to the system * and is never modified. Hence, no additional locking is needed * beyond what's necessary to access the cpu_t structure. */ char cpu_runrun; /* scheduling flag - set to preempt */ char cpu_kprunrun; /* force kernel preemption */ pri_t cpu_chosen_level; /* priority at which cpu */ /* was chosen for scheduling */ kthread_t *cpu_dispthread; /* thread selected for dispatch */ disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */ uint8_t cpu_disp_flags; /* flags used by dispatcher */ /* * The following field is updated when ever the cpu_dispthread * changes. Also in places, where the current thread(cpu_dispthread) * priority changes. This is used in disp_lowpri_cpu() */ pri_t cpu_dispatch_pri; /* priority of cpu_dispthread */ clock_t cpu_last_swtch; /* last time switched to new thread */ /* * Interrupt data. */ caddr_t cpu_intr_stack; /* interrupt stack */ kthread_t *cpu_intr_thread; /* interrupt thread list */ uint_t cpu_intr_actv; /* interrupt levels active (bitmask) */ int cpu_base_spl; /* priority for highest rupt active */ /* * Statistics. */ cpu_stats_t cpu_stats; /* per-CPU statistics */ struct kstat *cpu_info_kstat; /* kstat for cpu info */ uintptr_t cpu_profile_pc; /* kernel PC in profile interrupt */ uintptr_t cpu_profile_upc; /* user PC in profile interrupt */ uintptr_t cpu_profile_pil; /* PIL when profile interrupted */ ftrace_data_t cpu_ftrace; /* per cpu ftrace data */ clock_t cpu_deadman_counter; /* used by deadman() */ uint_t cpu_deadman_countdown; /* used by deadman() */ kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */ kcpc_ctx_t *cpu_cpc_ctx; /* performance counter context */ /* * Configuration information for the processor_info system call. 
*/ processor_info_t cpu_type_info; /* config info */ time_t cpu_state_begin; /* when CPU entered current state */ char cpu_cpr_flags; /* CPR related info */ struct cyc_cpu *cpu_cyclic; /* per cpu cyclic subsystem data */ struct squeue_set_s *cpu_squeue_set; /* per cpu squeue set */ struct nvlist *cpu_props; /* pool-related properties */ krwlock_t cpu_ft_lock; /* DTrace: fasttrap lock */ uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ volatile uint16_t cpu_mstate; /* cpu microstate */ volatile uint16_t cpu_mstate_gen; /* generation counter */ volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */ volatile hrtime_t cpu_acct[NCMSTATES]; /* cpu microstate data */ hrtime_t cpu_intracct[NCMSTATES]; /* interrupt mstate data */ hrtime_t cpu_waitrq; /* cpu run-queue wait time */ struct loadavg_s cpu_loadavg; /* loadavg info for this cpu */ char *cpu_idstr; /* for printing and debugging */ char *cpu_brandstr; /* for printing */ /* * Sum of all device interrupt weights that are currently directed at * this cpu. Cleared at start of interrupt redistribution. */ int32_t cpu_intr_weight; void *cpu_vm_data; struct cpu_physid *cpu_physid; /* physical associations */ uint64_t cpu_curr_clock; /* current clock freq in Hz */ char *cpu_supp_freqs; /* supported freqs in Hz */ uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */ uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */ /* * Interrupt load factor used by dispatcher & softcall */ hrtime_t cpu_intrlast; /* total interrupt time (nsec) */ int cpu_intrload; /* interrupt load factor (0-99%) */ uint_t cpu_rotor; /* for cheap pseudo-random numbers */ struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ /* * cpu_generation is updated whenever CPU goes on-line or off-line. * Updates to cpu_generation are protected by cpu_lock. * * See CPU_NEW_GENERATION() macro below. */ volatile uint_t cpu_generation; /* tracking on/off-line */ /* * New members must be added /before/ this member, as the CTF tools * rely on this being the last field before cpu_m, so they can * correctly calculate the offset when synthetically adding the cpu_m * member in objects that do not have it. This fixup is required for * uniquification to work correctly. */ uintptr_t cpu_m_pad; #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) struct machcpu cpu_m; /* per architecture info */ #endif } cpu_t; /* * The cpu_core structure consists of per-CPU state available in any context. * On some architectures, this may mean that the page(s) containing the * NCPU-sized array of cpu_core structures must be locked in the TLB -- it * is up to the platform to assure that this is performed properly. Note that * the structure is sized to avoid false sharing. */ #define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \ sizeof (uintptr_t) + sizeof (kmutex_t)) #define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE typedef struct cpu_core { uint16_t cpuc_dtrace_flags; /* DTrace flags */ uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */ uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */ uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */ kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */ } cpu_core_t; #ifdef _KERNEL extern cpu_core_t cpu_core[]; #endif /* _KERNEL */ /* * CPU_ON_INTR() macro. Returns non-zero if currently on interrupt stack. * Note that this isn't a test for a high PIL. 
For example, cpu_intr_actv * does not get updated when we go through sys_trap from TL>0 at high PIL. * getpil() should be used instead to check for PIL levels. */ #define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1)) /* * Check to see if an interrupt thread might be active at a given ipl. * If so return true. * We must be conservative--it is ok to give a false yes, but a false no * will cause disaster. (But if the situation changes after we check it is * ok--the caller is trying to ensure that an interrupt routine has been * exited). * This is used when trying to remove an interrupt handler from an autovector * list in avintr.c. */ #define INTR_ACTIVE(cpup, level) \ ((level) <= LOCK_LEVEL ? \ ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) /* * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one * looks at it. It's meant as a cheap mechanism to be incorporated in routines * wanting to avoid biasing, but where true randomness isn't needed (just * something that changes). */ #define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++) #if defined(_KERNEL) || defined(_KMEMUSER) #define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE) /* MEMBERS PROTECTED BY "atomicity": cpu_flags */ /* * Flags in the CPU structure. * * These are protected by cpu_lock (except during creation). * * Offlined-CPUs have three stages of being offline: * * CPU_ENABLE indicates that the CPU is participating in I/O interrupts * that can be directed at a number of different CPUs. If CPU_ENABLE * is off, the CPU will not be given interrupts that can be sent elsewhere, * but will still get interrupts from devices associated with that CPU only, * and from other CPUs. * * CPU_OFFLINE indicates that the dispatcher should not allow any threads * other than interrupt threads to run on that CPU. A CPU will not have * CPU_OFFLINE set if there are any bound threads (besides interrupts). * * CPU_QUIESCED is set if p_offline was able to completely turn idle the * CPU and it will not have to run interrupt threads. In this case it'll * stay in the idle loop until CPU_QUIESCED is turned off. * * CPU_FROZEN is used only by CPR to mark CPUs that have been successfully * suspended (in the suspend path), or have yet to be resumed (in the resume * case). * * On some platforms CPUs can be individually powered off. * The following flags are set for powered off CPUs: CPU_QUIESCED, * CPU_OFFLINE, and CPU_POWEROFF. The following flags are cleared: * CPU_RUNNING, CPU_READY, CPU_EXISTS, and CPU_ENABLE. */ #define CPU_RUNNING 0x001 /* CPU running */ #define CPU_READY 0x002 /* CPU ready for cross-calls */ #define CPU_QUIESCED 0x004 /* CPU will stay in idle */ #define CPU_EXISTS 0x008 /* CPU is configured */ #define CPU_ENABLE 0x010 /* CPU enabled for interrupts */ #define CPU_OFFLINE 0x020 /* CPU offline via p_online */ #define CPU_POWEROFF 0x040 /* CPU is powered off */ #define CPU_FROZEN 0x080 /* CPU is frozen via CPR suspend */ #define CPU_SPARE 0x100 /* CPU offline available for use */ #define CPU_FAULTED 0x200 /* CPU offline diagnosed faulty */ #define FMT_CPU_FLAGS \ "\20\12fault\11spare\10frozen" \ "\7poweroff\6offline\5enable\4exist\3quiesced\2ready\1run" #define CPU_ACTIVE(cpu) (((cpu)->cpu_flags & CPU_OFFLINE) == 0) /* * Flags for cpu_offline(), cpu_faulted(), and cpu_spare(). */ #define CPU_FORCED 0x0001 /* Force CPU offline */ /* * DTrace flags. 
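The flag bits and FMT_CPU_FLAGS string above are designed for the kernel printf "%b" conversion; the sketch below shows that pattern together with a CPU_ACTIVE() check. Whether a particular environment's cmn_err() supports %b is an assumption of this sketch.

/*
 * Sketch: decode a CPU's flag word for diagnostic output.
 */
static void
cpu_show_flags(cpu_t *cp)
{
        if (!CPU_ACTIVE(cp)) {
                /* Offline CPUs run only interrupt and bound threads. */
                cmn_err(CE_NOTE, "cpu %d is not accepting new threads",
                    cp->cpu_id);
        }
        cmn_err(CE_CONT, "cpu %d flags: %b\n",
            cp->cpu_id, cp->cpu_flags, FMT_CPU_FLAGS);
}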
*/ #define CPU_DTRACE_NOFAULT 0x0001 /* Don't fault */ #define CPU_DTRACE_DROP 0x0002 /* Drop this ECB */ #define CPU_DTRACE_BADADDR 0x0004 /* DTrace fault: bad address */ #define CPU_DTRACE_BADALIGN 0x0008 /* DTrace fault: bad alignment */ #define CPU_DTRACE_DIVZERO 0x0010 /* DTrace fault: divide by zero */ #define CPU_DTRACE_ILLOP 0x0020 /* DTrace fault: illegal operation */ #define CPU_DTRACE_NOSCRATCH 0x0040 /* DTrace fault: out of scratch */ #define CPU_DTRACE_KPRIV 0x0080 /* DTrace fault: bad kernel access */ #define CPU_DTRACE_UPRIV 0x0100 /* DTrace fault: bad user access */ #define CPU_DTRACE_TUPOFLOW 0x0200 /* DTrace fault: tuple stack overflow */ #if defined(__sparc) #define CPU_DTRACE_FAKERESTORE 0x0400 /* pid provider hint to getreg */ #endif #define CPU_DTRACE_ENTRY 0x0800 /* pid provider hint to ustack() */ #define CPU_DTRACE_BADSTACK 0x1000 /* DTrace fault: bad stack */ #define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \ CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \ CPU_DTRACE_NOSCRATCH | CPU_DTRACE_KPRIV | \ CPU_DTRACE_UPRIV | CPU_DTRACE_TUPOFLOW | \ CPU_DTRACE_BADSTACK) #define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP) /* * Dispatcher flags * These flags must be changed only by the current CPU. */ #define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ #endif /* _KERNEL || _KMEMUSER */ #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) /* * Macros for manipulating sets of CPUs as a bitmap. Note that this * bitmap may vary in size depending on the maximum CPU id a specific * platform supports. This may be different than the number of CPUs * the platform supports, since CPU ids can be sparse. We define two * sets of macros; one for platforms where the maximum CPU id is less * than the number of bits in a single word (32 in a 32-bit kernel, * 64 in a 64-bit kernel), and one for platforms that require bitmaps * of more than one word. */ #define CPUSET_WORDS BT_BITOUL(NCPU) #define CPUSET_NOTINSET ((uint_t)-1) #if CPUSET_WORDS > 1 typedef struct cpuset { ulong_t cpub[CPUSET_WORDS]; } cpuset_t; /* * Private functions for manipulating cpusets that do not fit in a * single word. These should not be used directly; instead the * CPUSET_* macros should be used so the code will be portable * across different definitions of NCPU. */ extern void cpuset_all(cpuset_t *); extern void cpuset_all_but(cpuset_t *, uint_t); extern int cpuset_isnull(cpuset_t *); extern int cpuset_cmp(cpuset_t *, cpuset_t *); extern void cpuset_only(cpuset_t *, uint_t); extern uint_t cpuset_find(cpuset_t *); extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *); #define CPUSET_ALL(set) cpuset_all(&(set)) #define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu) #define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu) #define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu) #define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu) #define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu) #define CPUSET_ISNULL(set) cpuset_isnull(&(set)) #define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2)) /* * Find one CPU in the cpuset. * Sets "cpu" to the id of the found CPU, or CPUSET_NOTINSET if no cpu * could be found. (i.e. empty set) */ #define CPUSET_FIND(set, cpu) { \ cpu = cpuset_find(&(set)); \ } /* * Determine the smallest and largest CPU id in the set. Returns * CPUSET_NOTINSET in smallest and largest when set is empty. 
*/ #define CPUSET_BOUNDS(set, smallest, largest) { \ cpuset_bounds(&(set), &(smallest), &(largest)); \ } /* * Atomic cpuset operations * These are safe to use for concurrent cpuset manipulations. * "xdel" and "xadd" are exclusive operations, that set "result" to "0" * if the add or del was successful, or "-1" if not successful. * (e.g. attempting to add a cpu to a cpuset that's already there, or * deleting a cpu that's not in the cpuset) */ #define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu)) #define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu)) #define CPUSET_ATOMIC_XADD(set, cpu, result) \ BT_ATOMIC_SET_EXCL((set).cpub, cpu, result) #define CPUSET_ATOMIC_XDEL(set, cpu, result) \ BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result) #define CPUSET_OR(set1, set2) { \ int _i; \ for (_i = 0; _i < CPUSET_WORDS; _i++) \ (set1).cpub[_i] |= (set2).cpub[_i]; \ } #define CPUSET_XOR(set1, set2) { \ int _i; \ for (_i = 0; _i < CPUSET_WORDS; _i++) \ (set1).cpub[_i] ^= (set2).cpub[_i]; \ } #define CPUSET_AND(set1, set2) { \ int _i; \ for (_i = 0; _i < CPUSET_WORDS; _i++) \ (set1).cpub[_i] &= (set2).cpub[_i]; \ } #define CPUSET_ZERO(set) { \ int _i; \ for (_i = 0; _i < CPUSET_WORDS; _i++) \ (set).cpub[_i] = 0; \ } #elif CPUSET_WORDS == 1 typedef ulong_t cpuset_t; /* a set of CPUs */ #define CPUSET(cpu) (1UL << (cpu)) #define CPUSET_ALL(set) ((void)((set) = ~0UL)) #define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu))) #define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu))) #define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu)) #define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu))) #define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu))) #define CPUSET_ISNULL(set) ((set) == 0) #define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2)) #define CPUSET_OR(set1, set2) ((void)((set1) |= (set2))) #define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2))) #define CPUSET_AND(set1, set2) ((void)((set1) &= (set2))) #define CPUSET_ZERO(set) ((void)((set) = 0)) #define CPUSET_FIND(set, cpu) { \ cpu = (uint_t)(lowbit(set) - 1); \ } #define CPUSET_BOUNDS(set, smallest, largest) { \ smallest = (uint_t)(lowbit(set) - 1); \ largest = (uint_t)(highbit(set) - 1); \ } #define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu)) #define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu)) #define CPUSET_ATOMIC_XADD(set, cpu, result) \ { result = atomic_set_long_excl(&(set), (cpu)); } #define CPUSET_ATOMIC_XDEL(set, cpu, result) \ { result = atomic_clear_long_excl(&(set), (cpu)); } #else /* CPUSET_WORDS <= 0 */ #error NCPU is undefined or invalid #endif /* CPUSET_WORDS */ extern cpuset_t cpu_seqid_inuse; #endif /* (_KERNEL || _KMEMUSER) && _MACHDEP */ #define CPU_CPR_OFFLINE 0x0 #define CPU_CPR_ONLINE 0x1 #define CPU_CPR_IS_OFFLINE(cpu) (((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) == 0) #define CPU_CPR_IS_ONLINE(cpu) ((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) #define CPU_SET_CPR_FLAGS(cpu, flag) ((cpu)->cpu_cpr_flags |= flag) #if defined(_KERNEL) || defined(_KMEMUSER) extern struct cpu *cpu[]; /* indexed by CPU number */ extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ extern cpu_t *cpu_list; /* list of CPUs */ extern cpu_t *cpu_active; /* list of active CPUs */ extern int ncpus; /* number of CPUs present */ extern int ncpus_online; /* number of CPUs not quiesced */ extern int max_ncpus; /* max present before ncpus is known */ extern int boot_max_ncpus; /* like max_ncpus but for real */ extern int boot_ncpus; /* # cpus present @ boot */ extern 
processorid_t max_cpuid; /* maximum CPU number */ extern struct cpu *cpu_inmotion; /* offline or partition move target */ extern cpu_t *clock_cpu_list; extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */ #if defined(__i386) || defined(__amd64) extern struct cpu *curcpup(void); #define CPU (curcpup()) /* Pointer to current CPU */ #else #define CPU (curthread->t_cpu) /* Pointer to current CPU */ #endif /* * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id * as the target and to grab cpu_lock instead of requiring the caller * to grab it. */ #define CPU_CURRENT -3 /* * Per-CPU statistics * * cpu_stats_t contains numerous system and VM-related statistics, in the form * of gauges or monotonically-increasing event occurrence counts. */ #define CPU_STATS_ENTER_K() kpreempt_disable() #define CPU_STATS_EXIT_K() kpreempt_enable() #define CPU_STATS_ADD_K(class, stat, amount) \ { kpreempt_disable(); /* keep from switching CPUs */\ CPU_STATS_ADDQ(CPU, class, stat, amount); \ kpreempt_enable(); \ } #define CPU_STATS_ADDQ(cp, class, stat, amount) { \ extern void __dtrace_probe___cpu_##class##info_##stat(uint_t, \ uint64_t *, cpu_t *); \ uint64_t *stataddr = &((cp)->cpu_stats.class.stat); \ __dtrace_probe___cpu_##class##info_##stat((amount), \ stataddr, cp); \ *(stataddr) += (amount); \ } #define CPU_STATS(cp, stat) \ ((cp)->cpu_stats.stat) /* * Increment CPU generation value. * This macro should be called whenever CPU goes on-line or off-line. * Updates to cpu_generation should be protected by cpu_lock. */ #define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) #endif /* _KERNEL || _KMEMUSER */ /* - * CPU support routines. + * CPU support routines (not for genassym.c) */ -#if defined(_KERNEL) && defined(__STDC__) /* not for genassym.c */ +#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && defined(__STDC__) struct zone; void cpu_list_init(cpu_t *); void cpu_add_unit(cpu_t *); void cpu_del_unit(int cpuid); void cpu_add_active(cpu_t *); void cpu_kstat_init(cpu_t *); void cpu_visibility_add(cpu_t *, struct zone *); void cpu_visibility_remove(cpu_t *, struct zone *); void cpu_visibility_configure(cpu_t *, struct zone *); void cpu_visibility_unconfigure(cpu_t *, struct zone *); void cpu_visibility_online(cpu_t *, struct zone *); void cpu_visibility_offline(cpu_t *, struct zone *); void cpu_create_intrstat(cpu_t *); void cpu_delete_intrstat(cpu_t *); int cpu_kstat_intrstat_update(kstat_t *, int); void cpu_intr_swtch_enter(kthread_t *); void cpu_intr_swtch_exit(kthread_t *); void mbox_lock_init(void); /* initialize cross-call locks */ void mbox_init(int cpun); /* initialize cross-calls */ void poke_cpu(int cpun); /* interrupt another CPU (to preempt) */ /* * values for safe_list. Pause state that CPUs are in. 
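Because all manipulation goes through the CPUSET_* macros defined earlier in this header, code like the sketch below is indifferent to whether cpuset_t is a single ulong_t or an array of them. It only compiles in contexts that satisfy the (_KERNEL || _KMEMUSER) && _MACHDEP guard around those macros.

/*
 * Sketch: basic set construction and lookup through the CPUSET_* API.
 */
static void
cpuset_demo(void)
{
        cpuset_t set;
        uint_t cpu;

        CPUSET_ZERO(set);
        CPUSET_ADD(set, 1);
        CPUSET_ADD(set, 5);

        if (CPU_IN_SET(set, 5))
                CPUSET_DEL(set, 5);

        /* The set still contains CPU 1, so a valid id comes back. */
        CPUSET_FIND(set, cpu);
        ASSERT(cpu != CPUSET_NOTINSET);
}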
*/ #define PAUSE_IDLE 0 /* normal state */ #define PAUSE_READY 1 /* paused thread ready to spl */ #define PAUSE_WAIT 2 /* paused thread is spl-ed high */ #define PAUSE_DIE 3 /* tell pause thread to leave */ #define PAUSE_DEAD 4 /* pause thread has left */ void mach_cpu_pause(volatile char *); void pause_cpus(cpu_t *off_cp, void *(*func)(void *)); void start_cpus(void); int cpus_paused(void); void cpu_pause_init(void); cpu_t *cpu_get(processorid_t cpun); /* get the CPU struct associated */ int cpu_online(cpu_t *cp); /* take cpu online */ int cpu_offline(cpu_t *cp, int flags); /* take cpu offline */ int cpu_spare(cpu_t *cp, int flags); /* take cpu to spare */ int cpu_faulted(cpu_t *cp, int flags); /* take cpu to faulted */ int cpu_poweron(cpu_t *cp); /* take powered-off cpu to offline */ int cpu_poweroff(cpu_t *cp); /* take offline cpu to powered-off */ cpu_t *cpu_intr_next(cpu_t *cp); /* get next online CPU taking intrs */ int cpu_intr_count(cpu_t *cp); /* count # of CPUs handling intrs */ int cpu_intr_on(cpu_t *cp); /* CPU taking I/O interrupts? */ void cpu_intr_enable(cpu_t *cp); /* enable I/O interrupts */ int cpu_intr_disable(cpu_t *cp); /* disable I/O interrupts */ void cpu_intr_alloc(cpu_t *cp, int n); /* allocate interrupt threads */ /* * Routines for checking CPU states. */ int cpu_is_online(cpu_t *); /* check if CPU is online */ int cpu_is_nointr(cpu_t *); /* check if CPU can service intrs */ int cpu_is_active(cpu_t *); /* check if CPU can run threads */ int cpu_is_offline(cpu_t *); /* check if CPU is offline */ int cpu_is_poweredoff(cpu_t *); /* check if CPU is powered off */ int cpu_flagged_online(cpu_flag_t); /* flags show CPU is online */ int cpu_flagged_nointr(cpu_flag_t); /* flags show CPU not handling intrs */ int cpu_flagged_active(cpu_flag_t); /* flags show CPU scheduling threads */ int cpu_flagged_offline(cpu_flag_t); /* flags show CPU is offline */ int cpu_flagged_poweredoff(cpu_flag_t); /* flags show CPU is powered off */ /* * The processor_info(2) state of a CPU is a simplified representation suitable * for use by an application program. Kernel subsystems should utilize the * internal per-CPU state as given by the cpu_flags member of the cpu structure, * as this information may include platform- or architecture-specific state * critical to a subsystem's disposition of a particular CPU. 
*/ void cpu_set_state(cpu_t *); /* record/timestamp current state */ int cpu_get_state(cpu_t *); /* get current cpu state */ const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */ void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */ void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */ /* frequencies */ int cpu_configure(int); int cpu_unconfigure(int); void cpu_destroy_bound_threads(cpu_t *cp); extern int cpu_bind_thread(kthread_t *tp, processorid_t bind, processorid_t *obind, int *error); extern int cpu_unbind(processorid_t cpu_id, boolean_t force); extern void thread_affinity_set(kthread_t *t, int cpu_id); extern void thread_affinity_clear(kthread_t *t); extern void affinity_set(int cpu_id); extern void affinity_clear(void); extern void init_cpu_mstate(struct cpu *, int); extern void term_cpu_mstate(struct cpu *); extern void new_cpu_mstate(int, hrtime_t); extern void get_cpu_mstate(struct cpu *, hrtime_t *); extern void thread_nomigrate(void); extern void thread_allowmigrate(void); extern void weakbinding_stop(void); extern void weakbinding_start(void); /* * The following routines affect the CPUs participation in interrupt processing, * if that is applicable on the architecture. This only affects interrupts * which aren't directed at the processor (not cross calls). * * cpu_disable_intr returns non-zero if interrupts were previously enabled. */ int cpu_disable_intr(struct cpu *cp); /* stop issuing interrupts to cpu */ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ /* * The mutex cpu_lock protects cpu_flags for all CPUs, as well as the ncpus * and ncpus_online counts. */ extern kmutex_t cpu_lock; /* lock protecting CPU data */ /* * CPU state change events * * Various subsystems need to know when CPUs change their state. They get this * information by registering CPU state change callbacks using * register_cpu_setup_func(). Whenever any CPU changes its state, the callback * function is called. The callback function is passed three arguments: * * Event, described by cpu_setup_t * CPU ID * Transparent pointer passed when registering the callback * * The callback function is called with cpu_lock held. The return value from the * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG * events. For these two events, non-zero return value indicates a failure and * prevents successful completion of the operation. * * New events may be added in the future. Callback functions should ignore any * events that they do not understand. 
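Combining the cpu_list linkage with the state query routines declared just above gives a simple reporting loop. Holding cpu_lock is one of the conditions the cpu_t comment earlier lists as making a list walk safe; the circular cpu_next linkage is assumed from common usage rather than stated here.

/*
 * Sketch: log the processor_info-style state of every existing CPU.
 */
static void
cpu_report_states(void)
{
        cpu_t *cp;

        mutex_enter(&cpu_lock);
        cp = cpu_list;
        do {
                cmn_err(CE_CONT, "cpu %d: %s\n",
                    cp->cpu_id, cpu_get_state_str(cp));
                cp = cp->cpu_next;
        } while (cp != cpu_list);
        mutex_exit(&cpu_lock);
}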
* * The following events provide notification callbacks: * * CPU_INIT A new CPU is started and added to the list of active CPUs * This event is only used during boot * * CPU_CONFIG A newly inserted CPU is prepared for starting running code * This event is called by DR code * * CPU_UNCONFIG CPU has been powered off and needs cleanup * This event is called by DR code * * CPU_ON CPU is enabled but does not run anything yet * * CPU_INTR_ON CPU is enabled and has interrupts enabled * * CPU_OFF CPU is going offline but can still run threads * * CPU_CPUPART_OUT CPU is going to move out of its partition * * CPU_CPUPART_IN CPU is going to move to a new partition * * CPU_SETUP CPU is set up during boot and can run threads */ typedef enum { CPU_INIT, CPU_CONFIG, CPU_UNCONFIG, CPU_ON, CPU_OFF, CPU_CPUPART_IN, CPU_CPUPART_OUT, CPU_SETUP, CPU_INTR_ON } cpu_setup_t; typedef int cpu_setup_func_t(cpu_setup_t, int, void *); /* * Routines used to register interest in cpu's being added to or removed * from the system. */ extern void register_cpu_setup_func(cpu_setup_func_t *, void *); extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); extern void cpu_state_change_notify(int, cpu_setup_t); /* * Call specified function on the given CPU */ typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); /* * Create various strings that describe the given CPU for the * processor_info system call and configuration-related kstats. */ #define CPU_IDSTRLEN 100 extern void init_cpu_info(struct cpu *); extern void populate_idstr(struct cpu *); extern void cpu_vm_data_init(struct cpu *); extern void cpu_vm_data_destroy(struct cpu *); -#endif /* _KERNEL */ +#endif /* _KERNEL || _FAKE_KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_CPUVAR_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h (revision 332526) @@ -1,101 +1,102 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. */ #ifndef _SYS_FM_UTIL_H #define _SYS_FM_UTIL_H #ifdef __cplusplus extern "C" { #endif #include #include /* * Shared user/kernel definitions for class length, error channel name, * and kernel event publisher string. */ #define FM_MAX_CLASS 100 #define FM_ERROR_CHAN "com.sun:fm:error" #define FM_PUB "fm" /* * ereport dump device transport support * * Ereports are written out to the dump device at a proscribed offset from the * end, similar to in-transit log messages. 
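A sketch of a state-change callback built on the declarations above. The callback contract (three arguments, invoked with cpu_lock held, a non-zero return vetoing CPU_CONFIG/CPU_UNCONFIG) comes from the comment above; taking cpu_lock around registration is an assumption based on typical usage rather than something this header requires explicitly.

static int
example_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
{
        ASSERT(MUTEX_HELD(&cpu_lock));

        switch (what) {
        case CPU_CONFIG:
                /* Returning non-zero here would veto the configuration. */
                break;
        case CPU_UNCONFIG:
                /* Release any per-CPU state tied to cpuid. */
                break;
        default:
                /* Ignore events this callback does not understand. */
                break;
        }
        return (0);
}

static void
example_cpu_setup_register(void *arg)
{
        mutex_enter(&cpu_lock);
        register_cpu_setup_func(example_cpu_setup, arg);
        mutex_exit(&cpu_lock);
}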
The ereports are represented as a * erpt_dump_t header followed by ed_size bytes of packed native nvlist data. * * NOTE: All of these constants and the header must be defined so they have the * same representation for *both* 32-bit and 64-bit producers and consumers. */ #define ERPT_MAGIC 0xf00d4eddU #define ERPT_MAX_ERRS 16 #define ERPT_DATA_SZ (6 * 1024) #define ERPT_EVCH_MAX 256 #define ERPT_HIWAT 64 typedef struct erpt_dump { uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ uint32_t ed_chksum; /* checksum32() of packed nvlist data */ uint32_t ed_size; /* ereport (nvl) fixed buf size */ uint32_t ed_pad; /* reserved for future use */ hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ struct { uint64_t sec; /* seconds since gettimeofday() Epoch */ uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ } ed_tod_base; } erpt_dump_t; -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_FAKE_KERNEL) #include #define FM_STK_DEPTH 20 /* maximum stack depth */ #define FM_SYM_SZ 64 /* maximum symbol size */ #define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */ #define FM_EREPORT_PAYLOAD_NAME_STACK "stack" extern errorq_t *ereport_errorq; extern void *ereport_dumpbuf; extern size_t ereport_dumplen; extern void fm_init(void); extern void fm_nvprint(nvlist_t *); #define fm_panic panic extern void fm_banner(void); extern void fm_ereport_dump(void); extern void fm_ereport_post(nvlist_t *, int); extern int is_fm_panic(); -#endif /* _KERNEL */ +#endif /* _KERNEL || _FAKE_KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_FM_UTIL_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h (revision 332525) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h (revision 332526) @@ -1,426 +1,427 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. 
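A consumer-side sketch of scanning dumped ereports as laid out above: each record is an erpt_dump_t header followed by ed_size bytes of packed nvlist data, and a zero ed_magic terminates the sequence. That records sit back to back and are adequately aligned, and how the raw bytes are read off the dump device, are assumptions of this sketch; checksum verification and nvlist unpacking are left out.

/*
 * Sketch: count the ereport records present in a raw dump buffer.
 */
static size_t
count_dumped_ereports(const char *buf, size_t len)
{
        size_t off = 0, count = 0;

        while (off + sizeof (erpt_dump_t) <= len) {
                const erpt_dump_t *ed = (const erpt_dump_t *)(buf + off);

                if (ed->ed_magic == 0)
                        break;          /* end marker */
                if (ed->ed_magic != ERPT_MAGIC)
                        break;          /* unexpected data, stop scanning */

                count++;
                off += sizeof (erpt_dump_t) + ed->ed_size;
        }
        return (count);
}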
*/ #ifndef _SYS_VNODE_H #define _SYS_VNODE_H #include_next #define IS_DEVVP(vp) \ ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) #define V_XATTRDIR 0x0000 /* attribute unnamed directory */ #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ /* * Structure of all optional attributes. */ typedef struct xoptattr { timestruc_t xoa_createtime; /* Create time of file */ uint8_t xoa_archive; uint8_t xoa_system; uint8_t xoa_readonly; uint8_t xoa_hidden; uint8_t xoa_nounlink; uint8_t xoa_immutable; uint8_t xoa_appendonly; uint8_t xoa_nodump; uint8_t xoa_opaque; uint8_t xoa_av_quarantined; uint8_t xoa_av_modified; uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; uint8_t xoa_reparse; uint64_t xoa_generation; uint8_t xoa_offline; uint8_t xoa_sparse; } xoptattr_t; /* * The xvattr structure is really a variable length structure that * is made up of: * - The classic vattr_t (xva_vattr) * - a 32 bit quantity (xva_mapsize) that specifies the size of the * attribute bitmaps in 32 bit words. * - A pointer to the returned attribute bitmap (needed because the * previous element, the requested attribute bitmap) is variable lenth. * - The requested attribute bitmap, which is an array of 32 bit words. * Callers use the XVA_SET_REQ() macro to set the bits corresponding to * the attributes that are being requested. * - The returned attribute bitmap, which is an array of 32 bit words. * File systems that support optional attributes use the XVA_SET_RTN() * macro to set the bits corresponding to the attributes that are being * returned. * - The xoptattr_t structure which contains the attribute values * * xva_mapsize determines how many words in the attribute bitmaps. * Immediately following the attribute bitmaps is the xoptattr_t. * xva_getxoptattr() is used to get the pointer to the xoptattr_t * section. */ #define XVA_MAPSIZE 3 /* Size of attr bitmaps */ #define XVA_MAGIC 0x78766174 /* Magic # for verification */ /* * The xvattr structure is an extensible structure which permits optional * attributes to be requested/returned. File systems may or may not support * optional attributes. They do so at their own discretion but if they do * support optional attributes, they must register the VFSFT_XVATTR feature * so that the optional attributes can be set/retrived. * * The fields of the xvattr structure are: * * xva_vattr - The first element of an xvattr is a legacy vattr structure * which includes the common attributes. If AT_XVATTR is set in the va_mask * then the entire structure is treated as an xvattr. If AT_XVATTR is not * set, then only the xva_vattr structure can be used. * * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. * * xva_mapsize - Size of requested and returned attribute bitmaps. * * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the * size of the array before it, xva_reqattrmap[], could change which means * the location of xva_rtnattrmap[] could change. This will allow unbundled * file systems to find the location of xva_rtnattrmap[] when the sizes change. * * xva_reqattrmap[] - Array of requested attributes. Attributes are * represented by a specific bit in a specific element of the attribute * map array. Callers set the bits corresponding to the attributes * that the caller wants to get/set. * * xva_rtnattrmap[] - Array of attributes that the file system was able to * process. Not all file systems support all optional attributes. This map * informs the caller which attributes the underlying file system was able * to set/get. 
(Same structure as the requested attributes array in terms * of each attribute corresponding to specific bits and array elements.) * * xva_xoptattrs - Structure containing values of optional attributes. * These values are only valid if the corresponding bits in xva_reqattrmap * are set and the underlying file system supports those attributes. */ typedef struct xvattr { vattr_t xva_vattr; /* Embedded vattr structure */ uint32_t xva_magic; /* Magic Number */ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ xoptattr_t xva_xoptattrs; /* Optional attributes */ } xvattr_t; /* * Attributes of interest to the caller of setattr or getattr. */ #define AT_TYPE 0x00001 #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 #define AT_FSID 0x00010 #define AT_NODEID 0x00020 #define AT_NLINK 0x00040 #define AT_SIZE 0x00080 #define AT_ATIME 0x00100 #define AT_MTIME 0x00200 #define AT_CTIME 0x00400 #define AT_RDEV 0x00800 #define AT_BLKSIZE 0x01000 #define AT_NBLOCKS 0x02000 /* 0x04000 */ /* unused */ #define AT_SEQ 0x08000 /* * If AT_XVATTR is set then there are additional bits to process in * the xvattr_t's attribute bitmap. If this is not set then the bitmap * MUST be ignored. Note that this bit must be set/cleared explicitly. * That is, setting AT_ALL will NOT set AT_XVATTR. */ #define AT_XVATTR 0x10000 #define AT_ALL (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\ AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\ AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) #define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\ AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE) #define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME) #define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\ AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) /* * Attribute bits used in the extensible attribute's (xva's) attribute * bitmaps. Note that the bitmaps are made up of a variable length number * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" * is the element in the bitmap (starting at 1). This convention is for * the convenience of the maintainer to keep track of which element each * attribute belongs to. * * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS * MUST USE THE XAT_* DEFINES. 
*/ #define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ #define XAT0_CREATETIME 0x00000001 /* Create time of file */ #define XAT0_ARCHIVE 0x00000002 /* Archive */ #define XAT0_SYSTEM 0x00000004 /* System */ #define XAT0_READONLY 0x00000008 /* Readonly */ #define XAT0_HIDDEN 0x00000010 /* Hidden */ #define XAT0_NOUNLINK 0x00000020 /* Nounlink */ #define XAT0_IMMUTABLE 0x00000040 /* immutable */ #define XAT0_APPENDONLY 0x00000080 /* appendonly */ #define XAT0_NODUMP 0x00000100 /* nodump */ #define XAT0_OPAQUE 0x00000200 /* opaque */ #define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ #define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ #define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ #define XAT0_REPARSE 0x00002000 /* FS reparse point */ #define XAT0_GEN 0x00004000 /* object generation number */ #define XAT0_OFFLINE 0x00008000 /* offline */ #define XAT0_SPARSE 0x00010000 /* sparse */ #define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \ XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \ XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| XAT0_AV_MODIFIED| \ XAT0_AV_SCANSTAMP|XAT0_REPARSE|XATO_GEN|XAT0_OFFLINE|XAT0_SPARSE) /* Support for XAT_* optional attributes */ #define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ #define XVA_SHFT 32 /* Used to shift index */ /* * Used to pry out the index and attribute bits from the XAT_* attributes * defined below. Note that we're masking things down to 32 bits then * casting to uint32_t. */ #define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) #define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) /* * The following defines present a "flat namespace" so that consumers don't * need to keep track of which element belongs to which bitmap entry. * * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER */ #define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) #define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) #define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) #define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) #define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) #define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) #define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) #define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) #define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) #define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) #define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) #define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) #define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) #define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) #define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) #define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) #define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) /* * The returned attribute map array (xva_rtnattrmap[]) is located past the * requested attribute map array (xva_reqattrmap[]). Its location changes * when the array sizes change. We use a separate pointer in a known location * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is * set in xva_init() */ #define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) /* * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap * of requested attributes (xva_reqattrmap[]). 
/*
 * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap
 * of requested attributes (xva_reqattrmap[]).
 */
#define	XVA_SET_REQ(xvap, attr)	{				\
	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
}

/*
 * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
 * of requested attributes (xva_reqattrmap[]).
 */
#define	XVA_CLR_REQ(xvap, attr)	{				\
	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \
}

/*
 * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap
 * of returned attributes (xva_rtnattrmap[]).
 */
#define	XVA_SET_RTN(xvap, attr)	{				\
	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
	(XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
}

/*
 * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[])
 * to see if the corresponding attribute bit is set.  If so, returns non-zero.
 */
#define	XVA_ISSET_REQ(xvap, attr)					\
	((((xvap)->xva_vattr.va_mask | AT_XVATTR) &&			\
		((xvap)->xva_magic == XVA_MAGIC) &&			\
		((xvap)->xva_mapsize > XVA_INDEX(attr))) ?		\
	((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)

/*
 * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[])
 * to see if the corresponding attribute bit is set.  If so, returns non-zero.
 */
#define	XVA_ISSET_RTN(xvap, attr)					\
	((((xvap)->xva_vattr.va_mask | AT_XVATTR) &&			\
		((xvap)->xva_magic == XVA_MAGIC) &&			\
		((xvap)->xva_mapsize > XVA_INDEX(attr))) ?		\
	((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)

#define	MODEMASK	07777		/* mode bits plus permission bits */
#define	PERMMASK	00777		/* permission bits */

/*
 * VOP_ACCESS flags
 */
#define	V_ACE_MASK	0x1	/* mask represents NFSv4 ACE permissions */

/*
 * Flags for vnode operations.
 */
enum rm		{ RMFILE, RMDIRECTORY };	/* rm or rmdir (remove) */
enum create	{ CRCREAT, CRMKNOD, CRMKDIR };	/* reason for create */

/*
 * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
 */
typedef struct vsecattr {
	uint_t		vsa_mask;	/* See below */
	int		vsa_aclcnt;	/* ACL entry count */
	void		*vsa_aclentp;	/* pointer to ACL entries */
	int		vsa_dfaclcnt;	/* default ACL entry count */
	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
	uint_t		vsa_aclflags;	/* ACE ACL flags */
} vsecattr_t;

/* vsa_mask values */
#define	VSA_ACL			0x0001
#define	VSA_ACLCNT		0x0002
#define	VSA_DFACL		0x0004
#define	VSA_DFACLCNT		0x0008
#define	VSA_ACE			0x0010
#define	VSA_ACECNT		0x0020
#define	VSA_ACE_ALLTYPES	0x0040
#define	VSA_ACE_ACLFLAGS	0x0080	/* get/set ACE ACL flags */
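/*
 * Illustrative sketch (not part of the interface): asking a file system for
 * the number of NFSv4 ACEs on a vnode.  The VOP_GETSECATTR() call shape and
 * the "vp"/"cr" arguments follow the illumos convention and are assumptions
 * here; the ACE count coming back in vsa_aclcnt matches the ZFS behavior,
 * but other file systems may differ.
 */
#if 0
	vsecattr_t vsec;
	int acecnt = 0;

	bzero(&vsec, sizeof (vsec));
	vsec.vsa_mask = VSA_ACECNT;		/* count only, no ACE data */
	if (VOP_GETSECATTR(vp, &vsec, 0, cr, NULL) == 0)
		acecnt = vsec.vsa_aclcnt;
#endif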
/*
 * Structure used by various vnode operations to determine
 * the context (pid, host, identity) of a caller.
 *
 * The cc_caller_id is used to identify one or more callers who invoke
 * operations, possibly on behalf of others.  For example, the NFS
 * server could have its own cc_caller_id which can be detected by
 * vnode/vfs operations or (FEM) monitors on those operations.  New
 * caller IDs are generated by fs_new_caller_id().
 */
typedef struct caller_context {
	pid_t		cc_pid;		/* Process ID of the caller */
	int		cc_sysid;	/* System ID, used for remote calls */
	u_longlong_t	cc_caller_id;	/* Identifier for (set of) caller(s) */
	ulong_t		cc_flags;
} caller_context_t;

struct taskq;

/*
 * Flags for VOP_LOOKUP
 *
 * FIGNORECASE and FSEARCH (defined in file.h) are also possible.
 */
#define	LOOKUP_DIR		0x01	/* want parent dir vp */
#define	LOOKUP_XATTR		0x02	/* look up extended attr dir */
#define	CREATE_XATTR_DIR	0x04	/* Create extended attr dir */
#define	LOOKUP_HAVE_SYSATTR_DIR	0x08	/* Already created virtual GFS dir */

/*
 * Flags for VOP_READDIR
 */
#define	V_RDDIR_ENTFLAGS	0x01	/* request dirent flags */
#define	V_RDDIR_ACCFILTER	0x02	/* filter out inaccessible dirents */

/*
 * Public vnode manipulation functions.
 */
#ifdef	_KERNEL

void	vn_rele_async(struct vnode *vp, struct taskq *taskq);

/*
 * Extensible vnode attribute (xva) routines:
 * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XVATTR)
 * xva_getxoptattr() returns a pointer to the xoptattr_t section of xvattr_t
 */
void		xva_init(xvattr_t *);
xoptattr_t	*xva_getxoptattr(xvattr_t *);	/* Get ptr to xoptattr_t */

#define	VN_RELE_ASYNC(vp, taskq)	{ \
	vn_rele_async(vp, taskq); \
}

#endif	/* _KERNEL */

/*
 * Flags to VOP_SETATTR/VOP_GETATTR.
 */
#define	ATTR_UTIME	0x01	/* non-default utime(2) request */
#define	ATTR_EXEC	0x02	/* invocation from exec(2) */
#define	ATTR_COMM	0x04	/* yield common vp attributes */
#define	ATTR_HINT	0x08	/* information returned will be `hint' */
#define	ATTR_REAL	0x10	/* yield attributes of the real vp */
#define	ATTR_NOACLCHECK	0x20	/* Don't check ACL when checking permissions */
#define	ATTR_TRIGGER	0x40	/* Mount first if vnode is a trigger mount */

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_VNODE_H */

Index: stable/11
===================================================================
--- stable/11	(revision 332525)
+++ stable/11	(revision 332526)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r329755
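/*
 * Illustrative sketch (not part of the change above): a complete xvattr_t
 * round trip through the xva_init(), XVA_SET_REQ(), XVA_ISSET_RTN() and
 * xva_getxoptattr() interfaces declared in sys/vnode.h.  The VOP_GETATTR()
 * call shape and the "vp"/"cr" arguments follow the illumos convention and
 * are assumptions here.
 */
#if 0
	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);				/* zeroes struct, sets AT_XVATTR */
	XVA_SET_REQ(&xva, XAT_READONLY);	/* request one optional attribute */

	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL) == 0 &&
	    XVA_ISSET_RTN(&xva, XAT_READONLY) &&
	    (xoap = xva_getxoptattr(&xva)) != NULL) {
		/* xoap->xoa_readonly now holds the returned value */
	}
#endif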