Index: projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb/zdb.c =================================================================== --- projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 326161) +++ projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 326162) @@ -1,4158 +1,4161 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef verify #include #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ zio_compress_table[(idx)].ci_name : "UNKNOWN") #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ (((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ? \ DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES)) #ifndef lint extern int reference_tracking_enable; extern boolean_t zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; #else int reference_tracking_enable; boolean_t zfs_recover; uint64_t zfs_arc_max, zfs_arc_meta_limit; int zfs_vdev_async_read_max_active; #endif const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); extern void dump_intent_log(zilog_t *); static uint64_t *zopt_object = NULL; static int zopt_objects = 0; static libzfs_handle_t *g_zfs; static uint64_t max_inflight = 1000; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init() { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[ [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] " "[ ...]\n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -E decode and display block from an " "embedded block pointer\n"); (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -O perform object lookups by path\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " "others)\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n"); (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " "exiting\n"); (void) fprintf(stderr, " -I -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, " -o = set global " "variable to an unsigned 32-bit integer value\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -q don't print label contents\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -U -- use alternate " "cachefile\n"); (void) fprintf(stderr, " -V do verbatim import\n"); (void) fprintf(stderr, " -x -- " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } static void dump_debug_buffer() { if (dump_opt['G']) { (void) printf("\n"); zfs_dbgmsg_print("zdb"); } } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); exit(1); } /* ARGSUSED */ static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } /* ARGSUSED */ static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, sizeof (buf)); } const char histo_stars[] = "****************************************"; const int histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) maxidx = i; if (histo[i] > 0 && i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) printf("\tUNKNOWN OBJECT TYPE\n"); } /*ARGSUSED*/ void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; void *prop; int i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { (void) printf("%s", (char *)prop); } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= sizeof (*bpop)) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (dump_opt['d'] < 5) return; for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); (void) printf("\t%s\n", blkbuf); } } /* ARGSUSED */ static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (int64_t i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } /*ARGSUSED*/ static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; int i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (int c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && !vd->vdev_removing) { for (int m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (int c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { uint64_t alloc, offset, entry; char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; /* * Print out the freelist entries in both encoded and decoded form. */ alloc = 0; for (offset = 0; offset < space_map_length(sm); offset += sizeof (entry)) { uint8_t mapshift = sm->sm_shift; VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (entry), &entry, DMU_READ_PREFETCH)); if (SM_DEBUG_DECODE(entry)) { (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", (u_longlong_t)(offset / sizeof (entry)), ddata[SM_DEBUG_ACTION_DECODE(entry)], (u_longlong_t)SM_DEBUG_TXG_DECODE(entry), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry)); } else { (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx\n", (u_longlong_t)(offset / sizeof (entry)), SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F', (u_longlong_t)((SM_OFFSET_DECODE(entry) << mapshift) + sm->sm_start), (u_longlong_t)((SM_OFFSET_DECODE(entry) << mapshift) + sm->sm_start + (SM_RUN_DECODE(entry) << mapshift)), (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift)); if (SM_TYPE_DECODE(entry) == SM_ALLOC) alloc += SM_RUN_DECODE(entry) << mapshift; else alloc -= SM_RUN_DECODE(entry) << mapshift; } } if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%llu) INCONSISTENT " "with space map summary (%llu)\n", (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_tree; avl_tree_t *t = &msp->ms_size_tree; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); metaslab_load_wait(msp); if (!msp->ms_loaded) { VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_tree); } dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); mutex_enter(&msp->ms_lock); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); mutex_exit(&msp->ms_lock); } } static void print_vdev_metaslab_header(vdev_t *vd) { (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n", (u_longlong_t)vd->vdev_id, "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %10s\n", "---------------", "-------------------", "---------------", "-------------"); } static void dump_metaslab_groups(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_objects > 0) { c = zopt_object[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_objects > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_objects; m++) { if (zopt_object[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_object[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_object[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { char name[DDT_NAMELEN]; ddt_entry_t dde; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)(dspace / count), (u_longlong_t)(mspace / count)); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT(error == ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total = { 0 }; ddt_stat_t dds_total = { 0 }; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); mutex_enter(rt->rt_lock); range_tree_walk(rt, dump_dtl_seg, prefix); mutex_exit(rt->rt_lock); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (int c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } /* from spa_history.c: spa_history_create_obj() */ #define HIS_BUF_LEN_DEF (128 << 10) #define HIS_BUF_LEN_MAX (1 << 30) static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf = NULL; uint64_t bufsize = HIS_BUF_LEN_DEF; uint64_t resid, len, off = 0; uint_t num = 0; int error; time_t tsec; struct tm t; char tbuf[30]; char internalstr[MAXPATHLEN]; if ((buf = malloc(bufsize)) == NULL) (void) fprintf(stderr, "Unable to read history: " "out of memory\n"); do { len = bufsize; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; /* * If the history block is too big, double the buffer * size and try again. */ if (resid == len) { free(buf); buf = NULL; bufsize <<= 1; if ((bufsize >= HIS_BUF_LEN_MAX) || ((buf = malloc(bufsize)) == NULL)) { (void) fprintf(stderr, "Unable to read history: " "out of memory\n"); return; } } } while (len != 0); free(buf); (void) printf("\nHistory:\n"); for (int i = 0; i < num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%lld] %s", zfs_history_event_names[ievent], txg, intstr); cmd = internalstr; } tsec = time; (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } } /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)bp->blk_birth); return; } blkbuf[0] = '\0'; for (int i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)bp->blk_birth); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } } static void print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (bp->blk_birth == 0) return (0); print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } /*ARGSUSED*/ static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } /*ARGSUSED*/ static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO } /*ARGSUSED*/ static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } /* ARGSUSED */ static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } /* ARGSUSED */ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf(" %*s: object %llu, %llu local blkptrs, " "%llu subobjs in object %llu, %s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static void dump_deadlist(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; uint64_t unused; char bytes[32]; char comp[32]; char uncomp[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); (void) printf("\n Deadlist: %s (%s/%s comp)\n", bytes, comp, uncomp); if (dump_opt['d'] < 4) return; (void) printf("\n"); /* force the tree to be loaded */ dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } } } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); err = dmu_objset_own(path, type, B_TRUE, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, strerror(err)); return (err); } if (dmu_objset_type(*osp) == DMU_OST_ZFS) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dmu_objset_disown(*osp, tag); *osp = NULL; } } sa_os = *osp; return (0); } static void close_objset(objset_t *os, void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dmu_objset_disown(os, tag); sa_attr_table = NULL; sa_os = NULL; } static void fuid_table_destroy() { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { char *domain; domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error != 0) { (void) snprintf(path, sizeof (path), "\?\?\?", (u_longlong_t)object); } (void) printf("\tpath %s\n", path); } dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); sa_handle_destroy(hdl); } /*ARGSUSED*/ static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group used */ dump_zap, /* ZFS user/group quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static void dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } dmu_object_info_from_dnode(dn, &doi); zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", bonus_size, "bonus", ZDB_OT_NAME(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); *print_header = 1; } if (verbosity >= 5) dump_indirect(dn); if (verbosity >= 5) { /* * Report the list of segments that comprise the object. */ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } if (db != NULL) dmu_buf_rele(db, FTAG); } static char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; static void dump_dir(objset_t *os) { dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; char *type = "UNKNOWN"; int verbosity = dump_opt['d']; int print_header = 1; int i, error; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf); if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) dump_object(os, zopt_object[i], verbosity, &print_header); (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) dump_deadlist(&dmu_objset_ds(os)->ds_deadlist); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header); object_count++; } ASSERT3U(object_count, ==, usedobjs); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf(header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf(footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) fprintf(stderr, "cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) { vdev_t vd; vdev_t *vdp = &vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vdp->vdev_top = vdp; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); uberblock_t *ub = (void *)((char *)lbl + uoff); if (uberblock_verify(ub)) continue; (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, "Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. */ static int dump_path_impl(objset_t *os, uint64_t obj, char *name) { int err; int header = 1; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: dump_object(os, child_obj, dump_opt['v'], &header); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. */ static int dump_path(char *ds, char *path) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); dmu_objset_disown(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path); close_objset(os, FTAG); return (err); } static int dump_label(const char *dev) { int fd; vdev_label_t label; char path[MAXPATHLEN]; char *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; uint64_t psize, ashift; boolean_t label_found = B_FALSE; (void) strlcpy(path, dev, sizeof (path)); if (dev[0] == '/') { if (strncmp(dev, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) { (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD)); } } else if (stat64(path, &statbuf) != 0) { char *s; (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, dev); if (((s = strrchr(dev, 's')) == NULL && (s = strchr(dev, 'p')) == NULL) || !isdigit(*(s + 1))) (void) strlcat(path, "s0", sizeof (path)); } if ((fd = open64(path, O_RDONLY)) < 0) { (void) fprintf(stderr, "cannot open '%s': %s\n", path, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) fprintf(stderr, "failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); exit(1); } if (S_ISBLK(statbuf.st_mode)) { (void) fprintf(stderr, "cannot use '%s': character device required\n", path); (void) close(fd); exit(1); } psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *config = NULL; if (!dump_opt['q']) { (void) printf("------------------------------------\n"); (void) printf("LABEL %d\n", l); (void) printf("------------------------------------\n"); } if (pread64(fd, &label, sizeof (label), vdev_label_offset(psize, l, 0)) != sizeof (label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); continue; } if (nvlist_unpack(buf, buflen, &config, 0) != 0) { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); ashift = SPA_MINBLOCKSHIFT; } else { nvlist_t *vdev_tree = NULL; if (!dump_opt['q']) dump_nvlist(config, 4); if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; nvlist_free(config); label_found = B_TRUE; } if (dump_opt['u']) dump_label_uberblocks(&label, ashift); } (void) close(fd); return (label_found ? 0 : 2); } static uint64_t dataset_feature_count[SPA_FEATURES]; /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) { int error; objset_t *os; error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); if (error != 0) return (0); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!dmu_objset_ds(os)->ds_feature_inuse[f]) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } dump_dir(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE]; uint64_t zcb_start; uint64_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; } zdb_cb_t; static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) zb->zb_ditto_samevdev++; break; } } if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } if (dump_opt['L']) return; if (BP_GET_DEDUP(bp)) { ddt_t *ddt; ddt_entry_t *dde; ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { refcnt = 0; } else { ddt_phys_t *ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); refcnt = ddp->ddp_refcnt; if (ddt_phys_total_refcnt(dde) == 0) ddt_remove(ddt, dde); } ddt_exit(ddt); } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_first_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } /* ARGSUSED */ static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); } /* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (bp == NULL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; int kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); int sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4dMB/s) " "estimated time remaining: %uhr %02umin %02usec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { ddt_bookmark_t ddb = { 0 }; ddt_entry_t dde; int error; while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; if (ddb.ddb_class == DDT_CLASS_UNIQUE) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); zcb->zcb_dedup_blocks++; } } if (!dump_opt['L']) { ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); } } ASSERT(error == ENOENT); } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_tree. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; metaslab_group_t *mg = vd->vdev_mg; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, mg); mutex_enter(&msp->ms_lock); metaslab_unload(msp); /* * For leak detection, we overload the metaslab * ms_tree to contain allocated segments * instead of free segments. As a result, * we can't use the normal metaslab_load/unload * interfaces. */ if (msp->ms_sm != NULL) { (void) fprintf(stderr, "\rloading space map for " "vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)c, (longlong_t)rvd->vdev_children, (longlong_t)m, (longlong_t)vd->vdev_ms_count); /* * We don't want to spend the CPU * manipulating the size-ordered * tree, so clear the range_tree * ops. */ msp->ms_tree->rt_ops = NULL; VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); if (!msp->ms_loaded) { msp->ms_loaded = B_TRUE; } } mutex_exit(&msp->ms_lock); } } (void) fprintf(stderr, "\n"); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } static void zdb_leak_fini(spa_t *spa) { if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; metaslab_group_t *mg = vd->vdev_mg; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(mg, ==, msp->ms_group); mutex_enter(&msp->ms_lock); /* * The ms_tree has been overloaded to * contain allocated segments. Now that we * finished traversing all blocks, any * block that remains in the ms_tree * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_tree. */ range_tree_vacate(msp->ms_tree, zdb_leak, vd); if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } mutex_exit(&msp->ms_lock); } } } } /* ARGSUSED */ static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb = { 0 }; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; boolean_t leaks = B_FALSE; (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * Load all space maps as SM_ALLOC maps, then traverse the pool * claiming each block we discover. If the pool is perfectly * consistent, the space maps will be empty when we're done. * Anything left over is a leak; any block we can't claim (because * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. */ zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. */ if (dump_opt['c']) { for (int i = 0; i < max_ncpus; i++) { (void) zio_wait(spa->spa_async_zio_root[i]); spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (int e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); } } } /* * Report any leaked segments. */ zdb_leak_fini(spa); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize; if (total_found == total_alloc) { if (!dump_opt['L']) (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); leaks = B_TRUE; } if (tzb->zb_count == 0) return (2); (void) printf("\n"); (void) printf("\tbp count: %10llu\n", (u_longlong_t)tzb->zb_count); (void) printf("\tganged count: %10llu\n", (longlong_t)tzb->zb_gangs); (void) printf("\tbp logical: %10llu avg: %6llu\n", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\tbp physical: %10llu avg:" " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\tbp allocated: %10llu avg:" " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\tbp deduped: %10llu ref>1:" " %6llu deduplication: %6.2f\n", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb.zcb_embedded_histogram[i], sizeof (zcb.zcb_embedded_histogram[i]) / sizeof (zcb.zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; char *typename; /* make sure nicenum has enough space */ CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); zb = &zcb.zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } } (void) printf("\n"); if (leaks) return (2); if (zcb.zcb_haderrors) return (3); return (0); } typedef struct zdb_ddt_entry { ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total = { 0 }; ddt_stat_t dds_total = { 0 }; avl_create(&t, ddt_entry_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; dds.dds_psize = zdde->zdde_ref_psize / refcnt; dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; dds.dds_ref_blocks = zdde->zdde_ref_blocks; dds.dds_ref_lsize = zdde->zdde_ref_lsize; dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], &dds, 0); umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_stat(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj( &spa->spa_dsl_pool->dp_free_bpobj, "Pool snapshot frees", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { ASSERT0(dataset_feature_count[f]); continue; } (void) feature_get_refcount(spa, &spa_feature_table[f], &refcount); if (dataset_feature_count[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld datasets != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)dataset_feature_count[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc != 0) { dump_debug_buffer(); exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_PHYS 0x0020 #define ZDB_FLAG_RAW 0x0040 #define ZDB_FLAG_PRINT_BLKPTR 0x0080 int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); (void) write(1, buf, size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; int nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); int i, j; char *hdr, *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the heirarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, char *path) { char *s, *p, *q; int i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = (int)strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i < 0 || i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (*s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } /* ARGSUSED */ static int random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) { return (random_get_pseudo_bytes(buf, len)); } /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:size[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; dup = strdup(thing); s = strtok(dup, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); size = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); flagstr = s ? s : ""; s = NULL; if (size == 0) s = "size must not be zero"; if (!IS_P2ALIGNED(size, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); free(dup); return; } for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { for (i = 0; flagstr[i]; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Invalid flag: %c\n", flagstr[i]); continue; } flags |= bit; /* If it's not something with an argument, keep going */ if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; p = &flagstr[i + 1]; if (bit == ZDB_FLAG_PRINT_BLKPTR) blkptr_offset = strtoull(p, &p, 16); if (*p != ':' && *p != '\0') { (void) printf("***Invalid flag arg: '%s'\n", s); free(dup); return; } i += p - &flagstr[i + 1]; /* skip over the number */ } } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); free(dup); return; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } psize = size; lsize = size; pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } if (flags & ZDB_FLAG_DECOMPRESS) { /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ enum zio_compress c; void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); abd_copy_to_buf(pbuf2, pabd, psize); VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, random_get_pseudo_bytes_cb, NULL)); VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, SPA_MAXBLOCKSIZE - psize)); for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; lsize -= SPA_MINBLOCKSIZE) { for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { if (zio_decompress_data(c, pabd, lbuf, psize, lsize) == 0 && zio_decompress_data_buf(c, pbuf2, lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } if (c != ZIO_COMPRESS_FUNCTIONS) break; lsize -= SPA_MINBLOCKSIZE; } umem_free(pbuf2, SPA_MAXBLOCKSIZE); umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (lsize <= psize) { (void) printf("Decompress of %s failed\n", thing); goto out; } buf = lbuf; size = lsize; } else { buf = abd_to_buf(pabd); size = psize; } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, size, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, size, flags); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp = { 0 }; unsigned long long *words = (void *)&bp; - char buf[SPA_MAXBLOCKSIZE]; + char *buf; int err; err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) printf("invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); + buf = malloc(SPA_MAXBLOCKSIZE); err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) printf("decode failed: %u\n", err); + free(buf); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); + free(buf); } static boolean_t pool_match(nvlist_t *cfg, char *tgt) { uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; if (guid != 0) { if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) return (v == guid); } else { if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) return (strcmp(s, tgt) == 0); } return (B_FALSE); } static char * find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv) { nvlist_t *pools; nvlist_t *match = NULL; char *name = NULL; char *sepp = NULL; char sep = '\0'; int count = 0; importargs_t args = { 0 }; args.paths = dirc; args.path = dirv; args.can_be_active = B_TRUE; if ((sepp = strpbrk(*target, "/@")) != NULL) { sep = *sepp; *sepp = '\0'; } pools = zpool_search_import(g_zfs, &args); if (pools != NULL) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, configp) == 0); if (pool_match(*configp, *target)) { count++; if (match != NULL) { /* print previously found config */ if (name != NULL) { (void) printf("%s\n", name); dump_nvlist(match, 8); name = NULL; } (void) printf("%s\n", nvpair_name(elem)); dump_nvlist(*configp, 8); } else { match = *configp; name = nvpair_name(elem); } } } } if (count > 1) (void) fatal("\tMatched %d pools - use pool GUID " "instead of pool name or \n" "\tpool name part of a dataset name to select pool", count); if (sepp) *sepp = sep; /* * If pool GUID was specified for pool id, replace it with pool name */ if (name && (strstr(*target, name) != *target)) { int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0); *target = umem_alloc(sz, UMEM_NOFAIL); (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : ""); } *configp = name ? match : NULL; return (name); } int main(int argc, char **argv) { int i, c; struct rlimit rl = { 1024, 1024 }; spa_t *spa = NULL; objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env; boolean_t target_is_spa = B_TRUE; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { switch (c) { case 'b': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'O': case 'R': case 's': case 'S': case 'u': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; /* NB: Sort single match options below. */ case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { (void) fprintf(stderr, "maximum number " "of inflight I/Os must be greater " "than 0\n"); usage(); } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; kernel_init(FREAD); g_zfs = libzfs_init(); if (g_zfs == NULL) fatal("Fail to initialize zfs"); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("AeEFlLOPRSX", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); return (0); } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } usage(); } if (dump_opt['l']) return (dump_label(argv[0])); if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; return (dump_path(argv[0], argv[1])); } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; target = argv[0]; if (dump_opt['e']) { nvlist_t *cfg = NULL; char *name = find_zpool(&target, &cfg, nsearch, searchdirs); error = ENOENT; if (name) { if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } if (nvlist_add_nvlist(cfg, ZPOOL_REWIND_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } error = spa_import(name, cfg, NULL, flags); } } if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_is_spa = B_FALSE; /* * Remove any trailing slash. Later code would get confused * by it, but we want to allow it so that "pool/" can * indicate that we want to dump the topmost filesystem, * rather than the whole pool. */ targetlen = strlen(target); if (targetlen != 0 && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; } if (error == 0) { if (target_is_spa || dump_opt['R']) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else { error = open_objset(target, DMU_OST_ANY, FTAG, &os); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); argv++; argc--; if (!dump_opt['R']) { if (argc > 0) { zopt_objects = argc; zopt_object = calloc(zopt_objects, sizeof (uint64_t)); for (i = 0; i < zopt_objects; i++) { errno = 0; zopt_object[i] = strtoull(argv[i], NULL, 0); if (zopt_object[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (os != NULL) { dump_dir(os); } else if (zopt_objects > 0 && !dump_opt['m']) { dump_dir(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['p'] = ZDB_FLAG_PHYS; flagbits['r'] = ZDB_FLAG_RAW; for (i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (os != NULL) close_objset(os, FTAG); else spa_close(spa, FTAG); fuid_table_destroy(); dump_debug_buffer(); libzfs_fini(g_zfs); kernel_fini(); return (0); } Index: projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb =================================================================== --- projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb (revision 326161) +++ projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb (revision 326162) Property changes on: projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris/cmd/zdb:r325505-326161 Index: projects/bsd_rdma_4_9/cddl/contrib/opensolaris =================================================================== --- projects/bsd_rdma_4_9/cddl/contrib/opensolaris (revision 326161) +++ projects/bsd_rdma_4_9/cddl/contrib/opensolaris (revision 326162) Property changes on: projects/bsd_rdma_4_9/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris:r326132-326161 Index: projects/bsd_rdma_4_9/cddl =================================================================== --- projects/bsd_rdma_4_9/cddl (revision 326161) +++ projects/bsd_rdma_4_9/cddl (revision 326162) Property changes on: projects/bsd_rdma_4_9/cddl ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl:r326132-326161 Index: projects/bsd_rdma_4_9/contrib/binutils/bfd/ihex.c =================================================================== --- projects/bsd_rdma_4_9/contrib/binutils/bfd/ihex.c (revision 326161) +++ projects/bsd_rdma_4_9/contrib/binutils/bfd/ihex.c (revision 326162) @@ -1,993 +1,993 @@ /* BFD back-end for Intel Hex objects. Copyright 1995, 1996, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. Written by Ian Lance Taylor of Cygnus Support . This file is part of BFD, the Binary File Descriptor library. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ /* This is what Intel Hex files look like: 1. INTEL FORMATS A. Intel 1 16-bit address-field format, for files 64k bytes in length or less. DATA RECORD Byte 1 Header = colon(:) 2..3 The number of data bytes in hex notation 4..5 High byte of the record load address 6..7 Low byte of the record load address 8..9 Record type, must be "00" 10..x Data bytes in hex notation: x = (number of bytes - 1) * 2 + 11 x+1..x+2 Checksum in hex notation x+3..x+4 Carriage return, line feed END RECORD Byte 1 Header = colon (:) 2..3 The byte count, must be "00" 4..7 Transfer-address (usually "0000") the jump-to address, execution start address 8..9 Record type, must be "01" 10..11 Checksum, in hex notation 12..13 Carriage return, line feed B. INTEL 2 MCS-86 format, using a 20-bit address for files larger than 64K bytes. DATA RECORD Byte 1 Header = colon (:) 2..3 The byte count of this record, hex notation 4..5 High byte of the record load address 6..7 Low byte of the record load address 8..9 Record type, must be "00" 10..x The data bytes in hex notation: x = (number of data bytes - 1) * 2 + 11 x+1..x+2 Checksum in hex notation x+3..x+4 Carriage return, line feed EXTENDED ADDRESS RECORD Byte 1 Header = colon(:) 2..3 The byte count, must be "02" 4..7 Load address, must be "0000" 8..9 Record type, must be "02" 10..11 High byte of the offset address 12..13 Low byte of the offset address 14..15 Checksum in hex notation 16..17 Carriage return, line feed The checksums are the two's complement of the 8-bit sum without carry of the byte count, offset address, and the record type. START ADDRESS RECORD Byte 1 Header = colon (:) 2..3 The byte count, must be "04" 4..7 Load address, must be "0000" 8..9 Record type, must be "03" 10..13 8086 CS value 14..17 8086 IP value 18..19 Checksum in hex notation 20..21 Carriage return, line feed Another document reports these additional types: EXTENDED LINEAR ADDRESS RECORD Byte 1 Header = colon (:) 2..3 The byte count, must be "02" 4..7 Load address, must be "0000" 8..9 Record type, must be "04" 10..13 Upper 16 bits of address of subsequent records 14..15 Checksum in hex notation 16..17 Carriage return, line feed START LINEAR ADDRESS RECORD Byte 1 Header = colon (:) 2..3 The byte count, must be "02" 4..7 Load address, must be "0000" 8..9 Record type, must be "05" 10..13 Upper 16 bits of start address 14..15 Checksum in hex notation 16..17 Carriage return, line feed The MRI compiler uses this, which is a repeat of type 5: EXTENDED START RECORD Byte 1 Header = colon (:) 2..3 The byte count, must be "04" 4..7 Load address, must be "0000" 8..9 Record type, must be "05" 10..13 Upper 16 bits of start address 14..17 Lower 16 bits of start address 18..19 Checksum in hex notation 20..21 Carriage return, line feed. */ #include "sysdep.h" #include "bfd.h" #include "libbfd.h" #include "libiberty.h" #include "safe-ctype.h" /* The number of bytes we put on one line during output. */ #define CHUNK 16 /* Macros for converting between hex and binary. */ #define NIBBLE(x) (hex_value (x)) #define HEX2(buffer) ((NIBBLE ((buffer)[0]) << 4) + NIBBLE ((buffer)[1])) #define HEX4(buffer) ((HEX2 (buffer) << 8) + HEX2 ((buffer) + 2)) #define ISHEX(x) (hex_p (x)) /* When we write out an ihex value, the values can not be output as they are seen. Instead, we hold them in memory in this structure. */ struct ihex_data_list { struct ihex_data_list *next; bfd_byte *data; bfd_vma where; bfd_size_type size; }; /* The ihex tdata information. */ struct ihex_data_struct { struct ihex_data_list *head; struct ihex_data_list *tail; }; /* Initialize by filling in the hex conversion array. */ static void ihex_init (void) { static bfd_boolean inited; if (! inited) { inited = TRUE; hex_init (); } } /* Create an ihex object. */ static bfd_boolean ihex_mkobject (bfd *abfd) { struct ihex_data_struct *tdata; tdata = bfd_alloc (abfd, sizeof (* tdata)); if (tdata == NULL) return FALSE; abfd->tdata.ihex_data = tdata; tdata->head = NULL; tdata->tail = NULL; return TRUE; } /* Read a byte from a BFD. Set *ERRORPTR if an error occurred. Return EOF on error or end of file. */ static INLINE int ihex_get_byte (bfd *abfd, bfd_boolean *errorptr) { bfd_byte c; if (bfd_bread (&c, (bfd_size_type) 1, abfd) != 1) { if (bfd_get_error () != bfd_error_file_truncated) *errorptr = TRUE; return EOF; } return (int) (c & 0xff); } /* Report a problem in an Intel Hex file. */ static void ihex_bad_byte (bfd *abfd, unsigned int lineno, int c, bfd_boolean error) { if (c == EOF) { if (! error) bfd_set_error (bfd_error_file_truncated); } else { char buf[10]; if (! ISPRINT (c)) sprintf (buf, "\\%03o", (unsigned int) c); else { buf[0] = c; buf[1] = '\0'; } (*_bfd_error_handler) (_("%B:%d: unexpected character `%s' in Intel Hex file"), abfd, lineno, buf); bfd_set_error (bfd_error_bad_value); } } /* Read an Intel hex file and turn it into sections. We create a new section for each contiguous set of bytes. */ static bfd_boolean ihex_scan (bfd *abfd) { bfd_vma segbase; bfd_vma extbase; asection *sec; unsigned int lineno; bfd_boolean error; bfd_byte *buf = NULL; size_t bufsize; int c; if (bfd_seek (abfd, (file_ptr) 0, SEEK_SET) != 0) goto error_return; abfd->start_address = 0; segbase = 0; extbase = 0; sec = NULL; lineno = 1; error = FALSE; bufsize = 0; while ((c = ihex_get_byte (abfd, &error)) != EOF) { if (c == '\r') continue; else if (c == '\n') { ++lineno; continue; } else if (c != ':') { ihex_bad_byte (abfd, lineno, c, error); goto error_return; } else { file_ptr pos; char hdr[8]; unsigned int i; unsigned int len; bfd_vma addr; unsigned int type; unsigned int chars; unsigned int chksum; /* This is a data record. */ pos = bfd_tell (abfd) - 1; /* Read the header bytes. */ if (bfd_bread (hdr, (bfd_size_type) 8, abfd) != 8) goto error_return; for (i = 0; i < 8; i++) { if (! ISHEX (hdr[i])) { ihex_bad_byte (abfd, lineno, hdr[i], error); goto error_return; } } len = HEX2 (hdr); addr = HEX4 (hdr + 2); type = HEX2 (hdr + 6); /* Read the data bytes. */ chars = len * 2 + 2; if (chars >= bufsize) { buf = bfd_realloc (buf, (bfd_size_type) chars); if (buf == NULL) goto error_return; bufsize = chars; } if (bfd_bread (buf, (bfd_size_type) chars, abfd) != chars) goto error_return; for (i = 0; i < chars; i++) { if (! ISHEX (buf[i])) { - ihex_bad_byte (abfd, lineno, hdr[i], error); + ihex_bad_byte (abfd, lineno, buf[i], error); goto error_return; } } /* Check the checksum. */ chksum = len + addr + (addr >> 8) + type; for (i = 0; i < len; i++) chksum += HEX2 (buf + 2 * i); if (((- chksum) & 0xff) != (unsigned int) HEX2 (buf + 2 * i)) { (*_bfd_error_handler) (_("%B:%u: bad checksum in Intel Hex file (expected %u, found %u)"), abfd, lineno, (- chksum) & 0xff, (unsigned int) HEX2 (buf + 2 * i)); bfd_set_error (bfd_error_bad_value); goto error_return; } switch (type) { case 0: /* This is a data record. */ if (sec != NULL && sec->vma + sec->size == extbase + segbase + addr) { /* This data goes at the end of the section we are currently building. */ sec->size += len; } else if (len > 0) { char secbuf[20]; char *secname; bfd_size_type amt; flagword flags; sprintf (secbuf, ".sec%d", bfd_count_sections (abfd) + 1); amt = strlen (secbuf) + 1; secname = bfd_alloc (abfd, amt); if (secname == NULL) goto error_return; strcpy (secname, secbuf); flags = SEC_HAS_CONTENTS | SEC_LOAD | SEC_ALLOC; sec = bfd_make_section_with_flags (abfd, secname, flags); if (sec == NULL) goto error_return; sec->vma = extbase + segbase + addr; sec->lma = extbase + segbase + addr; sec->size = len; sec->filepos = pos; } break; case 1: /* An end record. */ if (abfd->start_address == 0) abfd->start_address = addr; if (buf != NULL) free (buf); return TRUE; case 2: /* An extended address record. */ if (len != 2) { (*_bfd_error_handler) (_("%B:%u: bad extended address record length in Intel Hex file"), abfd, lineno); bfd_set_error (bfd_error_bad_value); goto error_return; } segbase = HEX4 (buf) << 4; sec = NULL; break; case 3: /* An extended start address record. */ if (len != 4) { (*_bfd_error_handler) (_("%B:%u: bad extended start address length in Intel Hex file"), abfd, lineno); bfd_set_error (bfd_error_bad_value); goto error_return; } abfd->start_address += (HEX4 (buf) << 4) + HEX4 (buf + 4); sec = NULL; break; case 4: /* An extended linear address record. */ if (len != 2) { (*_bfd_error_handler) (_("%B:%u: bad extended linear address record length in Intel Hex file"), abfd, lineno); bfd_set_error (bfd_error_bad_value); goto error_return; } extbase = HEX4 (buf) << 16; sec = NULL; break; case 5: /* An extended linear start address record. */ if (len != 2 && len != 4) { (*_bfd_error_handler) (_("%B:%u: bad extended linear start address length in Intel Hex file"), abfd, lineno); bfd_set_error (bfd_error_bad_value); goto error_return; } if (len == 2) abfd->start_address += HEX4 (buf) << 16; else abfd->start_address = (HEX4 (buf) << 16) + HEX4 (buf + 4); sec = NULL; break; default: (*_bfd_error_handler) (_("%B:%u: unrecognized ihex type %u in Intel Hex file"), abfd, lineno, type); bfd_set_error (bfd_error_bad_value); goto error_return; } } } if (error) goto error_return; if (buf != NULL) free (buf); return TRUE; error_return: if (buf != NULL) free (buf); return FALSE; } /* Try to recognize an Intel Hex file. */ static const bfd_target * ihex_object_p (bfd *abfd) { void * tdata_save; bfd_byte b[9]; unsigned int i; unsigned int type; ihex_init (); if (bfd_seek (abfd, (file_ptr) 0, SEEK_SET) != 0) return NULL; if (bfd_bread (b, (bfd_size_type) 9, abfd) != 9) { if (bfd_get_error () == bfd_error_file_truncated) bfd_set_error (bfd_error_wrong_format); return NULL; } if (b[0] != ':') { bfd_set_error (bfd_error_wrong_format); return NULL; } for (i = 1; i < 9; i++) { if (! ISHEX (b[i])) { bfd_set_error (bfd_error_wrong_format); return NULL; } } type = HEX2 (b + 7); if (type > 5) { bfd_set_error (bfd_error_wrong_format); return NULL; } /* OK, it looks like it really is an Intel Hex file. */ tdata_save = abfd->tdata.any; if (! ihex_mkobject (abfd) || ! ihex_scan (abfd)) { if (abfd->tdata.any != tdata_save && abfd->tdata.any != NULL) bfd_release (abfd, abfd->tdata.any); abfd->tdata.any = tdata_save; return NULL; } return abfd->xvec; } /* Read the contents of a section in an Intel Hex file. */ static bfd_boolean ihex_read_section (bfd *abfd, asection *section, bfd_byte *contents) { int c; bfd_byte *p; bfd_byte *buf = NULL; size_t bufsize; bfd_boolean error; if (bfd_seek (abfd, section->filepos, SEEK_SET) != 0) goto error_return; p = contents; bufsize = 0; error = FALSE; while ((c = ihex_get_byte (abfd, &error)) != EOF) { char hdr[8]; unsigned int len; unsigned int type; unsigned int i; if (c == '\r' || c == '\n') continue; /* This is called after ihex_scan has succeeded, so we ought to know the exact format. */ BFD_ASSERT (c == ':'); if (bfd_bread (hdr, (bfd_size_type) 8, abfd) != 8) goto error_return; len = HEX2 (hdr); type = HEX2 (hdr + 6); /* We should only see type 0 records here. */ if (type != 0) { (*_bfd_error_handler) (_("%B: internal error in ihex_read_section"), abfd); bfd_set_error (bfd_error_bad_value); goto error_return; } if (len * 2 > bufsize) { buf = bfd_realloc (buf, (bfd_size_type) len * 2); if (buf == NULL) goto error_return; bufsize = len * 2; } if (bfd_bread (buf, (bfd_size_type) len * 2, abfd) != len * 2) goto error_return; for (i = 0; i < len; i++) *p++ = HEX2 (buf + 2 * i); if ((bfd_size_type) (p - contents) >= section->size) { /* We've read everything in the section. */ if (buf != NULL) free (buf); return TRUE; } /* Skip the checksum. */ if (bfd_bread (buf, (bfd_size_type) 2, abfd) != 2) goto error_return; } if ((bfd_size_type) (p - contents) < section->size) { (*_bfd_error_handler) (_("%B: bad section length in ihex_read_section"), abfd); bfd_set_error (bfd_error_bad_value); goto error_return; } if (buf != NULL) free (buf); return TRUE; error_return: if (buf != NULL) free (buf); return FALSE; } /* Get the contents of a section in an Intel Hex file. */ static bfd_boolean ihex_get_section_contents (bfd *abfd, asection *section, void * location, file_ptr offset, bfd_size_type count) { if (section->used_by_bfd == NULL) { section->used_by_bfd = bfd_alloc (abfd, section->size); if (section->used_by_bfd == NULL) return FALSE; if (! ihex_read_section (abfd, section, section->used_by_bfd)) return FALSE; } memcpy (location, (bfd_byte *) section->used_by_bfd + offset, (size_t) count); return TRUE; } /* Set the contents of a section in an Intel Hex file. */ static bfd_boolean ihex_set_section_contents (bfd *abfd, asection *section, const void * location, file_ptr offset, bfd_size_type count) { struct ihex_data_list *n; bfd_byte *data; struct ihex_data_struct *tdata; if (count == 0 || (section->flags & SEC_ALLOC) == 0 || (section->flags & SEC_LOAD) == 0) return TRUE; n = bfd_alloc (abfd, sizeof (* n)); if (n == NULL) return FALSE; data = bfd_alloc (abfd, count); if (data == NULL) return FALSE; memcpy (data, location, (size_t) count); n->data = data; n->where = section->lma + offset; n->size = count; /* Sort the records by address. Optimize for the common case of adding a record to the end of the list. */ tdata = abfd->tdata.ihex_data; if (tdata->tail != NULL && n->where >= tdata->tail->where) { tdata->tail->next = n; n->next = NULL; tdata->tail = n; } else { struct ihex_data_list **pp; for (pp = &tdata->head; *pp != NULL && (*pp)->where < n->where; pp = &(*pp)->next) ; n->next = *pp; *pp = n; if (n->next == NULL) tdata->tail = n; } return TRUE; } /* Write a record out to an Intel Hex file. */ static bfd_boolean ihex_write_record (bfd *abfd, size_t count, unsigned int addr, unsigned int type, bfd_byte *data) { static const char digs[] = "0123456789ABCDEF"; char buf[9 + CHUNK * 2 + 4]; char *p; unsigned int chksum; unsigned int i; size_t total; #define TOHEX(buf, v) \ ((buf)[0] = digs[((v) >> 4) & 0xf], (buf)[1] = digs[(v) & 0xf]) buf[0] = ':'; TOHEX (buf + 1, count); TOHEX (buf + 3, (addr >> 8) & 0xff); TOHEX (buf + 5, addr & 0xff); TOHEX (buf + 7, type); chksum = count + addr + (addr >> 8) + type; for (i = 0, p = buf + 9; i < count; i++, p += 2, data++) { TOHEX (p, *data); chksum += *data; } TOHEX (p, (- chksum) & 0xff); p[2] = '\r'; p[3] = '\n'; total = 9 + count * 2 + 4; if (bfd_bwrite (buf, (bfd_size_type) total, abfd) != total) return FALSE; return TRUE; } /* Write out an Intel Hex file. */ static bfd_boolean ihex_write_object_contents (bfd *abfd) { bfd_vma segbase; bfd_vma extbase; struct ihex_data_list *l; segbase = 0; extbase = 0; for (l = abfd->tdata.ihex_data->head; l != NULL; l = l->next) { bfd_vma where; bfd_byte *p; bfd_size_type count; where = l->where; p = l->data; count = l->size; while (count > 0) { size_t now; unsigned int rec_addr; now = count; if (count > CHUNK) now = CHUNK; if (where > segbase + extbase + 0xffff) { bfd_byte addr[2]; /* We need a new base address. */ if (where <= 0xfffff) { /* The addresses should be sorted. */ BFD_ASSERT (extbase == 0); segbase = where & 0xf0000; addr[0] = (bfd_byte)(segbase >> 12) & 0xff; addr[1] = (bfd_byte)(segbase >> 4) & 0xff; if (! ihex_write_record (abfd, 2, 0, 2, addr)) return FALSE; } else { /* The extended address record and the extended linear address record are combined, at least by some readers. We need an extended linear address record here, so if we've already written out an extended address record, zero it out to avoid confusion. */ if (segbase != 0) { addr[0] = 0; addr[1] = 0; if (! ihex_write_record (abfd, 2, 0, 2, addr)) return FALSE; segbase = 0; } extbase = where & 0xffff0000; if (where > extbase + 0xffff) { char buf[20]; sprintf_vma (buf, where); (*_bfd_error_handler) (_("%s: address 0x%s out of range for Intel Hex file"), bfd_get_filename (abfd), buf); bfd_set_error (bfd_error_bad_value); return FALSE; } addr[0] = (bfd_byte)(extbase >> 24) & 0xff; addr[1] = (bfd_byte)(extbase >> 16) & 0xff; if (! ihex_write_record (abfd, 2, 0, 4, addr)) return FALSE; } } rec_addr = where - (extbase + segbase); /* Output records shouldn't cross 64K boundaries. */ if (rec_addr + now > 0xffff) now = 0x10000 - rec_addr; if (! ihex_write_record (abfd, now, rec_addr, 0, p)) return FALSE; where += now; p += now; count -= now; } } if (abfd->start_address != 0) { bfd_vma start; bfd_byte startbuf[4]; start = abfd->start_address; if (start <= 0xfffff) { startbuf[0] = (bfd_byte)((start & 0xf0000) >> 12) & 0xff; startbuf[1] = 0; startbuf[2] = (bfd_byte)(start >> 8) & 0xff; startbuf[3] = (bfd_byte)start & 0xff; if (! ihex_write_record (abfd, 4, 0, 3, startbuf)) return FALSE; } else { startbuf[0] = (bfd_byte)(start >> 24) & 0xff; startbuf[1] = (bfd_byte)(start >> 16) & 0xff; startbuf[2] = (bfd_byte)(start >> 8) & 0xff; startbuf[3] = (bfd_byte)start & 0xff; if (! ihex_write_record (abfd, 4, 0, 5, startbuf)) return FALSE; } } if (! ihex_write_record (abfd, 0, 0, 1, NULL)) return FALSE; return TRUE; } /* Set the architecture for the output file. The architecture is irrelevant, so we ignore errors about unknown architectures. */ static bfd_boolean ihex_set_arch_mach (bfd *abfd, enum bfd_architecture arch, unsigned long mach) { if (! bfd_default_set_arch_mach (abfd, arch, mach)) { if (arch != bfd_arch_unknown) return FALSE; } return TRUE; } /* Get the size of the headers, for the linker. */ static int ihex_sizeof_headers (bfd *abfd ATTRIBUTE_UNUSED, struct bfd_link_info *info ATTRIBUTE_UNUSED) { return 0; } /* Some random definitions for the target vector. */ #define ihex_close_and_cleanup _bfd_generic_close_and_cleanup #define ihex_bfd_free_cached_info _bfd_generic_bfd_free_cached_info #define ihex_new_section_hook _bfd_generic_new_section_hook #define ihex_get_section_contents_in_window _bfd_generic_get_section_contents_in_window #define ihex_get_symtab_upper_bound bfd_0l #define ihex_canonicalize_symtab ((long (*) (bfd *, asymbol **)) bfd_0l) #define ihex_make_empty_symbol _bfd_generic_make_empty_symbol #define ihex_print_symbol _bfd_nosymbols_print_symbol #define ihex_get_symbol_info _bfd_nosymbols_get_symbol_info #define ihex_bfd_is_target_special_symbol ((bfd_boolean (*) (bfd *, asymbol *)) bfd_false) #define ihex_bfd_is_local_label_name _bfd_nosymbols_bfd_is_local_label_name #define ihex_get_lineno _bfd_nosymbols_get_lineno #define ihex_find_nearest_line _bfd_nosymbols_find_nearest_line #define ihex_find_inliner_info _bfd_nosymbols_find_inliner_info #define ihex_bfd_make_debug_symbol _bfd_nosymbols_bfd_make_debug_symbol #define ihex_read_minisymbols _bfd_nosymbols_read_minisymbols #define ihex_minisymbol_to_symbol _bfd_nosymbols_minisymbol_to_symbol #define ihex_bfd_get_relocated_section_contents bfd_generic_get_relocated_section_contents #define ihex_bfd_relax_section bfd_generic_relax_section #define ihex_bfd_gc_sections bfd_generic_gc_sections #define ihex_bfd_merge_sections bfd_generic_merge_sections #define ihex_bfd_is_group_section bfd_generic_is_group_section #define ihex_bfd_discard_group bfd_generic_discard_group #define ihex_section_already_linked _bfd_generic_section_already_linked #define ihex_bfd_link_hash_table_create _bfd_generic_link_hash_table_create #define ihex_bfd_link_hash_table_free _bfd_generic_link_hash_table_free #define ihex_bfd_link_add_symbols _bfd_generic_link_add_symbols #define ihex_bfd_link_just_syms _bfd_generic_link_just_syms #define ihex_bfd_final_link _bfd_generic_final_link #define ihex_bfd_link_split_section _bfd_generic_link_split_section /* The Intel Hex target vector. */ const bfd_target ihex_vec = { "ihex", /* Name. */ bfd_target_ihex_flavour, BFD_ENDIAN_UNKNOWN, /* Target byte order. */ BFD_ENDIAN_UNKNOWN, /* Target headers byte order. */ 0, /* Object flags. */ (SEC_HAS_CONTENTS | SEC_ALLOC | SEC_LOAD), /* Section flags. */ 0, /* Leading underscore. */ ' ', /* AR_pad_char. */ 16, /* AR_max_namelen. */ bfd_getb64, bfd_getb_signed_64, bfd_putb64, bfd_getb32, bfd_getb_signed_32, bfd_putb32, bfd_getb16, bfd_getb_signed_16, bfd_putb16, /* Data. */ bfd_getb64, bfd_getb_signed_64, bfd_putb64, bfd_getb32, bfd_getb_signed_32, bfd_putb32, bfd_getb16, bfd_getb_signed_16, bfd_putb16, /* Headers. */ { _bfd_dummy_target, ihex_object_p, /* bfd_check_format. */ _bfd_dummy_target, _bfd_dummy_target, }, { bfd_false, ihex_mkobject, _bfd_generic_mkarchive, bfd_false, }, { /* bfd_write_contents. */ bfd_false, ihex_write_object_contents, _bfd_write_archive_contents, bfd_false, }, BFD_JUMP_TABLE_GENERIC (ihex), BFD_JUMP_TABLE_COPY (_bfd_generic), BFD_JUMP_TABLE_CORE (_bfd_nocore), BFD_JUMP_TABLE_ARCHIVE (_bfd_noarchive), BFD_JUMP_TABLE_SYMBOLS (ihex), BFD_JUMP_TABLE_RELOCS (_bfd_norelocs), BFD_JUMP_TABLE_WRITE (ihex), BFD_JUMP_TABLE_LINK (ihex), BFD_JUMP_TABLE_DYNAMIC (_bfd_nodynamic), NULL, NULL }; Index: projects/bsd_rdma_4_9/contrib/binutils/bfd/peXXigen.c =================================================================== --- projects/bsd_rdma_4_9/contrib/binutils/bfd/peXXigen.c (revision 326161) +++ projects/bsd_rdma_4_9/contrib/binutils/bfd/peXXigen.c (revision 326162) @@ -1,2173 +1,2193 @@ /* Support for the generic parts of PE/PEI; the common executable parts. Copyright 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. Written by Cygnus Solutions. This file is part of BFD, the Binary File Descriptor library. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ /* Most of this hacked by Steve Chamberlain . PE/PEI rearrangement (and code added): Donn Terry Softway Systems, Inc. */ /* Hey look, some documentation [and in a place you expect to find it]! The main reference for the pei format is "Microsoft Portable Executable and Common Object File Format Specification 4.1". Get it if you need to do some serious hacking on this code. Another reference: "Peering Inside the PE: A Tour of the Win32 Portable Executable File Format", MSJ 1994, Volume 9. The *sole* difference between the pe format and the pei format is that the latter has an MSDOS 2.0 .exe header on the front that prints the message "This app must be run under Windows." (or some such). (FIXME: Whether that statement is *really* true or not is unknown. Are there more subtle differences between pe and pei formats? For now assume there aren't. If you find one, then for God sakes document it here!) The Microsoft docs use the word "image" instead of "executable" because the former can also refer to a DLL (shared library). Confusion can arise because the `i' in `pei' also refers to "image". The `pe' format can also create images (i.e. executables), it's just that to run on a win32 system you need to use the pei format. FIXME: Please add more docs here so the next poor fool that has to hack on this code has a chance of getting something accomplished without wasting too much time. */ /* This expands into COFF_WITH_pe, COFF_WITH_pep, or COFF_WITH_pex64 depending on whether we're compiling for straight PE or PE+. */ #define COFF_WITH_XX #include "sysdep.h" #include "bfd.h" #include "libbfd.h" #include "coff/internal.h" /* NOTE: it's strange to be including an architecture specific header in what's supposed to be general (to PE/PEI) code. However, that's where the definitions are, and they don't vary per architecture within PE/PEI, so we get them from there. FIXME: The lack of variance is an assumption which may prove to be incorrect if new PE/PEI targets are created. */ #if defined COFF_WITH_pex64 # include "coff/x86_64.h" #elif defined COFF_WITH_pep # include "coff/ia64.h" #else # include "coff/i386.h" #endif #include "coff/pe.h" #include "libcoff.h" #include "libpei.h" #if defined COFF_WITH_pep || defined COFF_WITH_pex64 # undef AOUTSZ # define AOUTSZ PEPAOUTSZ # define PEAOUTHDR PEPAOUTHDR #endif /* FIXME: This file has various tests of POWERPC_LE_PE. Those tests worked when the code was in peicode.h, but no longer work now that the code is in peigen.c. PowerPC NT is said to be dead. If anybody wants to revive the code, you will have to figure out how to handle those issues. */ void _bfd_XXi_swap_sym_in (bfd * abfd, void * ext1, void * in1) { SYMENT *ext = (SYMENT *) ext1; struct internal_syment *in = (struct internal_syment *) in1; if (ext->e.e_name[0] == 0) { in->_n._n_n._n_zeroes = 0; in->_n._n_n._n_offset = H_GET_32 (abfd, ext->e.e.e_offset); } else memcpy (in->_n._n_name, ext->e.e_name, SYMNMLEN); in->n_value = H_GET_32 (abfd, ext->e_value); in->n_scnum = H_GET_16 (abfd, ext->e_scnum); if (sizeof (ext->e_type) == 2) in->n_type = H_GET_16 (abfd, ext->e_type); else in->n_type = H_GET_32 (abfd, ext->e_type); in->n_sclass = H_GET_8 (abfd, ext->e_sclass); in->n_numaux = H_GET_8 (abfd, ext->e_numaux); #ifndef STRICT_PE_FORMAT /* This is for Gnu-created DLLs. */ /* The section symbols for the .idata$ sections have class 0x68 (C_SECTION), which MS documentation indicates is a section symbol. Unfortunately, the value field in the symbol is simply a copy of the .idata section's flags rather than something useful. When these symbols are encountered, change the value to 0 so that they will be handled somewhat correctly in the bfd code. */ if (in->n_sclass == C_SECTION) { in->n_value = 0x0; /* Create synthetic empty sections as needed. DJ */ if (in->n_scnum == 0) { asection *sec; for (sec = abfd->sections; sec; sec = sec->next) { if (strcmp (sec->name, in->n_name) == 0) { in->n_scnum = sec->target_index; break; } } } if (in->n_scnum == 0) { int unused_section_number = 0; asection *sec; char *name; flagword flags; for (sec = abfd->sections; sec; sec = sec->next) if (unused_section_number <= sec->target_index) unused_section_number = sec->target_index + 1; name = bfd_alloc (abfd, (bfd_size_type) strlen (in->n_name) + 10); if (name == NULL) return; strcpy (name, in->n_name); flags = SEC_HAS_CONTENTS | SEC_ALLOC | SEC_DATA | SEC_LOAD; sec = bfd_make_section_anyway_with_flags (abfd, name, flags); sec->vma = 0; sec->lma = 0; sec->size = 0; sec->filepos = 0; sec->rel_filepos = 0; sec->reloc_count = 0; sec->line_filepos = 0; sec->lineno_count = 0; sec->userdata = NULL; sec->next = NULL; sec->alignment_power = 2; sec->target_index = unused_section_number; in->n_scnum = unused_section_number; } in->n_sclass = C_STAT; } #endif #ifdef coff_swap_sym_in_hook /* This won't work in peigen.c, but since it's for PPC PE, it's not worth fixing. */ coff_swap_sym_in_hook (abfd, ext1, in1); #endif } unsigned int _bfd_XXi_swap_sym_out (bfd * abfd, void * inp, void * extp) { struct internal_syment *in = (struct internal_syment *) inp; SYMENT *ext = (SYMENT *) extp; if (in->_n._n_name[0] == 0) { H_PUT_32 (abfd, 0, ext->e.e.e_zeroes); H_PUT_32 (abfd, in->_n._n_n._n_offset, ext->e.e.e_offset); } else memcpy (ext->e.e_name, in->_n._n_name, SYMNMLEN); H_PUT_32 (abfd, in->n_value, ext->e_value); H_PUT_16 (abfd, in->n_scnum, ext->e_scnum); if (sizeof (ext->e_type) == 2) H_PUT_16 (abfd, in->n_type, ext->e_type); else H_PUT_32 (abfd, in->n_type, ext->e_type); H_PUT_8 (abfd, in->n_sclass, ext->e_sclass); H_PUT_8 (abfd, in->n_numaux, ext->e_numaux); return SYMESZ; } void _bfd_XXi_swap_aux_in (bfd * abfd, void * ext1, int type, int class, int indx ATTRIBUTE_UNUSED, int numaux ATTRIBUTE_UNUSED, void * in1) { AUXENT *ext = (AUXENT *) ext1; union internal_auxent *in = (union internal_auxent *) in1; switch (class) { case C_FILE: if (ext->x_file.x_fname[0] == 0) { in->x_file.x_n.x_zeroes = 0; in->x_file.x_n.x_offset = H_GET_32 (abfd, ext->x_file.x_n.x_offset); } else memcpy (in->x_file.x_fname, ext->x_file.x_fname, FILNMLEN); return; case C_STAT: case C_LEAFSTAT: case C_HIDDEN: if (type == T_NULL) { in->x_scn.x_scnlen = GET_SCN_SCNLEN (abfd, ext); in->x_scn.x_nreloc = GET_SCN_NRELOC (abfd, ext); in->x_scn.x_nlinno = GET_SCN_NLINNO (abfd, ext); in->x_scn.x_checksum = H_GET_32 (abfd, ext->x_scn.x_checksum); in->x_scn.x_associated = H_GET_16 (abfd, ext->x_scn.x_associated); in->x_scn.x_comdat = H_GET_8 (abfd, ext->x_scn.x_comdat); return; } break; } in->x_sym.x_tagndx.l = H_GET_32 (abfd, ext->x_sym.x_tagndx); in->x_sym.x_tvndx = H_GET_16 (abfd, ext->x_sym.x_tvndx); if (class == C_BLOCK || class == C_FCN || ISFCN (type) || ISTAG (class)) { in->x_sym.x_fcnary.x_fcn.x_lnnoptr = GET_FCN_LNNOPTR (abfd, ext); in->x_sym.x_fcnary.x_fcn.x_endndx.l = GET_FCN_ENDNDX (abfd, ext); } else { in->x_sym.x_fcnary.x_ary.x_dimen[0] = H_GET_16 (abfd, ext->x_sym.x_fcnary.x_ary.x_dimen[0]); in->x_sym.x_fcnary.x_ary.x_dimen[1] = H_GET_16 (abfd, ext->x_sym.x_fcnary.x_ary.x_dimen[1]); in->x_sym.x_fcnary.x_ary.x_dimen[2] = H_GET_16 (abfd, ext->x_sym.x_fcnary.x_ary.x_dimen[2]); in->x_sym.x_fcnary.x_ary.x_dimen[3] = H_GET_16 (abfd, ext->x_sym.x_fcnary.x_ary.x_dimen[3]); } if (ISFCN (type)) { in->x_sym.x_misc.x_fsize = H_GET_32 (abfd, ext->x_sym.x_misc.x_fsize); } else { in->x_sym.x_misc.x_lnsz.x_lnno = GET_LNSZ_LNNO (abfd, ext); in->x_sym.x_misc.x_lnsz.x_size = GET_LNSZ_SIZE (abfd, ext); } } unsigned int _bfd_XXi_swap_aux_out (bfd * abfd, void * inp, int type, int class, int indx ATTRIBUTE_UNUSED, int numaux ATTRIBUTE_UNUSED, void * extp) { union internal_auxent *in = (union internal_auxent *) inp; AUXENT *ext = (AUXENT *) extp; memset (ext, 0, AUXESZ); switch (class) { case C_FILE: if (in->x_file.x_fname[0] == 0) { H_PUT_32 (abfd, 0, ext->x_file.x_n.x_zeroes); H_PUT_32 (abfd, in->x_file.x_n.x_offset, ext->x_file.x_n.x_offset); } else memcpy (ext->x_file.x_fname, in->x_file.x_fname, FILNMLEN); return AUXESZ; case C_STAT: case C_LEAFSTAT: case C_HIDDEN: if (type == T_NULL) { PUT_SCN_SCNLEN (abfd, in->x_scn.x_scnlen, ext); PUT_SCN_NRELOC (abfd, in->x_scn.x_nreloc, ext); PUT_SCN_NLINNO (abfd, in->x_scn.x_nlinno, ext); H_PUT_32 (abfd, in->x_scn.x_checksum, ext->x_scn.x_checksum); H_PUT_16 (abfd, in->x_scn.x_associated, ext->x_scn.x_associated); H_PUT_8 (abfd, in->x_scn.x_comdat, ext->x_scn.x_comdat); return AUXESZ; } break; } H_PUT_32 (abfd, in->x_sym.x_tagndx.l, ext->x_sym.x_tagndx); H_PUT_16 (abfd, in->x_sym.x_tvndx, ext->x_sym.x_tvndx); if (class == C_BLOCK || class == C_FCN || ISFCN (type) || ISTAG (class)) { PUT_FCN_LNNOPTR (abfd, in->x_sym.x_fcnary.x_fcn.x_lnnoptr, ext); PUT_FCN_ENDNDX (abfd, in->x_sym.x_fcnary.x_fcn.x_endndx.l, ext); } else { H_PUT_16 (abfd, in->x_sym.x_fcnary.x_ary.x_dimen[0], ext->x_sym.x_fcnary.x_ary.x_dimen[0]); H_PUT_16 (abfd, in->x_sym.x_fcnary.x_ary.x_dimen[1], ext->x_sym.x_fcnary.x_ary.x_dimen[1]); H_PUT_16 (abfd, in->x_sym.x_fcnary.x_ary.x_dimen[2], ext->x_sym.x_fcnary.x_ary.x_dimen[2]); H_PUT_16 (abfd, in->x_sym.x_fcnary.x_ary.x_dimen[3], ext->x_sym.x_fcnary.x_ary.x_dimen[3]); } if (ISFCN (type)) H_PUT_32 (abfd, in->x_sym.x_misc.x_fsize, ext->x_sym.x_misc.x_fsize); else { PUT_LNSZ_LNNO (abfd, in->x_sym.x_misc.x_lnsz.x_lnno, ext); PUT_LNSZ_SIZE (abfd, in->x_sym.x_misc.x_lnsz.x_size, ext); } return AUXESZ; } void _bfd_XXi_swap_lineno_in (bfd * abfd, void * ext1, void * in1) { LINENO *ext = (LINENO *) ext1; struct internal_lineno *in = (struct internal_lineno *) in1; in->l_addr.l_symndx = H_GET_32 (abfd, ext->l_addr.l_symndx); in->l_lnno = GET_LINENO_LNNO (abfd, ext); } unsigned int _bfd_XXi_swap_lineno_out (bfd * abfd, void * inp, void * outp) { struct internal_lineno *in = (struct internal_lineno *) inp; struct external_lineno *ext = (struct external_lineno *) outp; H_PUT_32 (abfd, in->l_addr.l_symndx, ext->l_addr.l_symndx); PUT_LINENO_LNNO (abfd, in->l_lnno, ext); return LINESZ; } void _bfd_XXi_swap_aouthdr_in (bfd * abfd, void * aouthdr_ext1, void * aouthdr_int1) { PEAOUTHDR * src = (PEAOUTHDR *) aouthdr_ext1; AOUTHDR * aouthdr_ext = (AOUTHDR *) aouthdr_ext1; struct internal_aouthdr *aouthdr_int = (struct internal_aouthdr *) aouthdr_int1; struct internal_extra_pe_aouthdr *a = &aouthdr_int->pe; aouthdr_int->magic = H_GET_16 (abfd, aouthdr_ext->magic); aouthdr_int->vstamp = H_GET_16 (abfd, aouthdr_ext->vstamp); aouthdr_int->tsize = GET_AOUTHDR_TSIZE (abfd, aouthdr_ext->tsize); aouthdr_int->dsize = GET_AOUTHDR_DSIZE (abfd, aouthdr_ext->dsize); aouthdr_int->bsize = GET_AOUTHDR_BSIZE (abfd, aouthdr_ext->bsize); aouthdr_int->entry = GET_AOUTHDR_ENTRY (abfd, aouthdr_ext->entry); aouthdr_int->text_start = GET_AOUTHDR_TEXT_START (abfd, aouthdr_ext->text_start); #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) /* PE32+ does not have data_start member! */ aouthdr_int->data_start = GET_AOUTHDR_DATA_START (abfd, aouthdr_ext->data_start); a->BaseOfData = aouthdr_int->data_start; #endif a->Magic = aouthdr_int->magic; a->MajorLinkerVersion = H_GET_8 (abfd, aouthdr_ext->vstamp); a->MinorLinkerVersion = H_GET_8 (abfd, aouthdr_ext->vstamp + 1); a->SizeOfCode = aouthdr_int->tsize ; a->SizeOfInitializedData = aouthdr_int->dsize ; a->SizeOfUninitializedData = aouthdr_int->bsize ; a->AddressOfEntryPoint = aouthdr_int->entry; a->BaseOfCode = aouthdr_int->text_start; a->ImageBase = GET_OPTHDR_IMAGE_BASE (abfd, src->ImageBase); a->SectionAlignment = H_GET_32 (abfd, src->SectionAlignment); a->FileAlignment = H_GET_32 (abfd, src->FileAlignment); a->MajorOperatingSystemVersion = H_GET_16 (abfd, src->MajorOperatingSystemVersion); a->MinorOperatingSystemVersion = H_GET_16 (abfd, src->MinorOperatingSystemVersion); a->MajorImageVersion = H_GET_16 (abfd, src->MajorImageVersion); a->MinorImageVersion = H_GET_16 (abfd, src->MinorImageVersion); a->MajorSubsystemVersion = H_GET_16 (abfd, src->MajorSubsystemVersion); a->MinorSubsystemVersion = H_GET_16 (abfd, src->MinorSubsystemVersion); a->Reserved1 = H_GET_32 (abfd, src->Reserved1); a->SizeOfImage = H_GET_32 (abfd, src->SizeOfImage); a->SizeOfHeaders = H_GET_32 (abfd, src->SizeOfHeaders); a->CheckSum = H_GET_32 (abfd, src->CheckSum); a->Subsystem = H_GET_16 (abfd, src->Subsystem); a->DllCharacteristics = H_GET_16 (abfd, src->DllCharacteristics); a->SizeOfStackReserve = GET_OPTHDR_SIZE_OF_STACK_RESERVE (abfd, src->SizeOfStackReserve); a->SizeOfStackCommit = GET_OPTHDR_SIZE_OF_STACK_COMMIT (abfd, src->SizeOfStackCommit); a->SizeOfHeapReserve = GET_OPTHDR_SIZE_OF_HEAP_RESERVE (abfd, src->SizeOfHeapReserve); a->SizeOfHeapCommit = GET_OPTHDR_SIZE_OF_HEAP_COMMIT (abfd, src->SizeOfHeapCommit); a->LoaderFlags = H_GET_32 (abfd, src->LoaderFlags); a->NumberOfRvaAndSizes = H_GET_32 (abfd, src->NumberOfRvaAndSizes); { int idx; + /* PR 17512: Corrupt PE binaries can cause seg-faults. */ + if (a->NumberOfRvaAndSizes > 16) + { + (*_bfd_error_handler) + (_("%B: aout header specifies an invalid number of data-directory entries: %d"), + abfd, a->NumberOfRvaAndSizes); + /* Paranoia: If the number is corrupt, then assume that the + actual entries themselves might be corrupt as well. */ + a->NumberOfRvaAndSizes = 0; + } + for (idx = 0; idx < 16; idx++) { /* If data directory is empty, rva also should be 0. */ int size = H_GET_32 (abfd, src->DataDirectory[idx][1]); a->DataDirectory[idx].Size = size; if (size) a->DataDirectory[idx].VirtualAddress = H_GET_32 (abfd, src->DataDirectory[idx][0]); else a->DataDirectory[idx].VirtualAddress = 0; } } if (aouthdr_int->entry) { aouthdr_int->entry += a->ImageBase; #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) aouthdr_int->entry &= 0xffffffff; #endif } if (aouthdr_int->tsize) { aouthdr_int->text_start += a->ImageBase; #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) aouthdr_int->text_start &= 0xffffffff; #endif } #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) /* PE32+ does not have data_start member! */ if (aouthdr_int->dsize) { aouthdr_int->data_start += a->ImageBase; aouthdr_int->data_start &= 0xffffffff; } #endif #ifdef POWERPC_LE_PE /* These three fields are normally set up by ppc_relocate_section. In the case of reading a file in, we can pick them up from the DataDirectory. */ first_thunk_address = a->DataDirectory[PE_IMPORT_ADDRESS_TABLE].VirtualAddress; thunk_size = a->DataDirectory[PE_IMPORT_ADDRESS_TABLE].Size; import_table_size = a->DataDirectory[PE_IMPORT_TABLE].Size; #endif } /* A support function for below. */ static void add_data_entry (bfd * abfd, struct internal_extra_pe_aouthdr *aout, int idx, char *name, bfd_vma base) { asection *sec = bfd_get_section_by_name (abfd, name); /* Add import directory information if it exists. */ if ((sec != NULL) && (coff_section_data (abfd, sec) != NULL) && (pei_section_data (abfd, sec) != NULL)) { /* If data directory is empty, rva also should be 0. */ int size = pei_section_data (abfd, sec)->virt_size; aout->DataDirectory[idx].Size = size; if (size) { aout->DataDirectory[idx].VirtualAddress = (sec->vma - base) & 0xffffffff; sec->flags |= SEC_DATA; } } } unsigned int _bfd_XXi_swap_aouthdr_out (bfd * abfd, void * in, void * out) { struct internal_aouthdr *aouthdr_in = (struct internal_aouthdr *) in; pe_data_type *pe = pe_data (abfd); struct internal_extra_pe_aouthdr *extra = &pe->pe_opthdr; PEAOUTHDR *aouthdr_out = (PEAOUTHDR *) out; bfd_vma sa, fa, ib; IMAGE_DATA_DIRECTORY idata2, idata5, tls; if (pe->force_minimum_alignment) { if (!extra->FileAlignment) extra->FileAlignment = PE_DEF_FILE_ALIGNMENT; if (!extra->SectionAlignment) extra->SectionAlignment = PE_DEF_SECTION_ALIGNMENT; } if (extra->Subsystem == IMAGE_SUBSYSTEM_UNKNOWN) extra->Subsystem = pe->target_subsystem; sa = extra->SectionAlignment; fa = extra->FileAlignment; ib = extra->ImageBase; idata2 = pe->pe_opthdr.DataDirectory[PE_IMPORT_TABLE]; idata5 = pe->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE]; tls = pe->pe_opthdr.DataDirectory[PE_TLS_TABLE]; if (aouthdr_in->tsize) { aouthdr_in->text_start -= ib; #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) aouthdr_in->text_start &= 0xffffffff; #endif } if (aouthdr_in->dsize) { aouthdr_in->data_start -= ib; #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) aouthdr_in->data_start &= 0xffffffff; #endif } if (aouthdr_in->entry) { aouthdr_in->entry -= ib; #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) aouthdr_in->entry &= 0xffffffff; #endif } #define FA(x) (((x) + fa -1 ) & (- fa)) #define SA(x) (((x) + sa -1 ) & (- sa)) /* We like to have the sizes aligned. */ aouthdr_in->bsize = FA (aouthdr_in->bsize); extra->NumberOfRvaAndSizes = IMAGE_NUMBEROF_DIRECTORY_ENTRIES; /* First null out all data directory entries. */ memset (extra->DataDirectory, 0, sizeof (extra->DataDirectory)); add_data_entry (abfd, extra, 0, ".edata", ib); add_data_entry (abfd, extra, 2, ".rsrc", ib); add_data_entry (abfd, extra, 3, ".pdata", ib); /* In theory we do not need to call add_data_entry for .idata$2 or .idata$5. It will be done in bfd_coff_final_link where all the required information is available. If however, we are not going to perform a final link, eg because we have been invoked by objcopy or strip, then we need to make sure that these Data Directory entries are initialised properly. So - we copy the input values into the output values, and then, if a final link is going to be performed, it can overwrite them. */ extra->DataDirectory[PE_IMPORT_TABLE] = idata2; extra->DataDirectory[PE_IMPORT_ADDRESS_TABLE] = idata5; extra->DataDirectory[PE_TLS_TABLE] = tls; if (extra->DataDirectory[PE_IMPORT_TABLE].VirtualAddress == 0) /* Until other .idata fixes are made (pending patch), the entry for .idata is needed for backwards compatibility. FIXME. */ add_data_entry (abfd, extra, 1, ".idata", ib); /* For some reason, the virtual size (which is what's set by add_data_entry) for .reloc is not the same as the size recorded in this slot by MSVC; it doesn't seem to cause problems (so far), but since it's the best we've got, use it. It does do the right thing for .pdata. */ if (pe->has_reloc_section) add_data_entry (abfd, extra, 5, ".reloc", ib); { asection *sec; bfd_vma hsize = 0; bfd_vma dsize = 0; bfd_vma isize = 0; bfd_vma tsize = 0; for (sec = abfd->sections; sec; sec = sec->next) { int rounded = FA (sec->size); /* The first non-zero section filepos is the header size. Sections without contents will have a filepos of 0. */ if (hsize == 0) hsize = sec->filepos; if (sec->flags & SEC_DATA) dsize += rounded; if (sec->flags & SEC_CODE) tsize += rounded; /* The image size is the total VIRTUAL size (which is what is in the virt_size field). Files have been seen (from MSVC 5.0 link.exe) where the file size of the .data segment is quite small compared to the virtual size. Without this fix, strip munges the file. FIXME: We need to handle holes between sections, which may happpen when we covert from another format. We just use the virtual address and virtual size of the last section for the image size. */ if (coff_section_data (abfd, sec) != NULL && pei_section_data (abfd, sec) != NULL) isize = (sec->vma - extra->ImageBase + SA (FA (pei_section_data (abfd, sec)->virt_size))); } aouthdr_in->dsize = dsize; aouthdr_in->tsize = tsize; extra->SizeOfHeaders = hsize; extra->SizeOfImage = isize; } H_PUT_16 (abfd, aouthdr_in->magic, aouthdr_out->standard.magic); #define LINKER_VERSION 256 /* That is, 2.56 */ /* This piece of magic sets the "linker version" field to LINKER_VERSION. */ H_PUT_16 (abfd, (LINKER_VERSION / 100 + (LINKER_VERSION % 100) * 256), aouthdr_out->standard.vstamp); PUT_AOUTHDR_TSIZE (abfd, aouthdr_in->tsize, aouthdr_out->standard.tsize); PUT_AOUTHDR_DSIZE (abfd, aouthdr_in->dsize, aouthdr_out->standard.dsize); PUT_AOUTHDR_BSIZE (abfd, aouthdr_in->bsize, aouthdr_out->standard.bsize); PUT_AOUTHDR_ENTRY (abfd, aouthdr_in->entry, aouthdr_out->standard.entry); PUT_AOUTHDR_TEXT_START (abfd, aouthdr_in->text_start, aouthdr_out->standard.text_start); #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) /* PE32+ does not have data_start member! */ PUT_AOUTHDR_DATA_START (abfd, aouthdr_in->data_start, aouthdr_out->standard.data_start); #endif PUT_OPTHDR_IMAGE_BASE (abfd, extra->ImageBase, aouthdr_out->ImageBase); H_PUT_32 (abfd, extra->SectionAlignment, aouthdr_out->SectionAlignment); H_PUT_32 (abfd, extra->FileAlignment, aouthdr_out->FileAlignment); H_PUT_16 (abfd, extra->MajorOperatingSystemVersion, aouthdr_out->MajorOperatingSystemVersion); H_PUT_16 (abfd, extra->MinorOperatingSystemVersion, aouthdr_out->MinorOperatingSystemVersion); H_PUT_16 (abfd, extra->MajorImageVersion, aouthdr_out->MajorImageVersion); H_PUT_16 (abfd, extra->MinorImageVersion, aouthdr_out->MinorImageVersion); H_PUT_16 (abfd, extra->MajorSubsystemVersion, aouthdr_out->MajorSubsystemVersion); H_PUT_16 (abfd, extra->MinorSubsystemVersion, aouthdr_out->MinorSubsystemVersion); H_PUT_32 (abfd, extra->Reserved1, aouthdr_out->Reserved1); H_PUT_32 (abfd, extra->SizeOfImage, aouthdr_out->SizeOfImage); H_PUT_32 (abfd, extra->SizeOfHeaders, aouthdr_out->SizeOfHeaders); H_PUT_32 (abfd, extra->CheckSum, aouthdr_out->CheckSum); H_PUT_16 (abfd, extra->Subsystem, aouthdr_out->Subsystem); H_PUT_16 (abfd, extra->DllCharacteristics, aouthdr_out->DllCharacteristics); PUT_OPTHDR_SIZE_OF_STACK_RESERVE (abfd, extra->SizeOfStackReserve, aouthdr_out->SizeOfStackReserve); PUT_OPTHDR_SIZE_OF_STACK_COMMIT (abfd, extra->SizeOfStackCommit, aouthdr_out->SizeOfStackCommit); PUT_OPTHDR_SIZE_OF_HEAP_RESERVE (abfd, extra->SizeOfHeapReserve, aouthdr_out->SizeOfHeapReserve); PUT_OPTHDR_SIZE_OF_HEAP_COMMIT (abfd, extra->SizeOfHeapCommit, aouthdr_out->SizeOfHeapCommit); H_PUT_32 (abfd, extra->LoaderFlags, aouthdr_out->LoaderFlags); H_PUT_32 (abfd, extra->NumberOfRvaAndSizes, aouthdr_out->NumberOfRvaAndSizes); { int idx; for (idx = 0; idx < 16; idx++) { H_PUT_32 (abfd, extra->DataDirectory[idx].VirtualAddress, aouthdr_out->DataDirectory[idx][0]); H_PUT_32 (abfd, extra->DataDirectory[idx].Size, aouthdr_out->DataDirectory[idx][1]); } } return AOUTSZ; } unsigned int _bfd_XXi_only_swap_filehdr_out (bfd * abfd, void * in, void * out) { int idx; struct internal_filehdr *filehdr_in = (struct internal_filehdr *) in; struct external_PEI_filehdr *filehdr_out = (struct external_PEI_filehdr *) out; if (pe_data (abfd)->has_reloc_section) filehdr_in->f_flags &= ~F_RELFLG; if (pe_data (abfd)->dll) filehdr_in->f_flags |= F_DLL; filehdr_in->pe.e_magic = DOSMAGIC; filehdr_in->pe.e_cblp = 0x90; filehdr_in->pe.e_cp = 0x3; filehdr_in->pe.e_crlc = 0x0; filehdr_in->pe.e_cparhdr = 0x4; filehdr_in->pe.e_minalloc = 0x0; filehdr_in->pe.e_maxalloc = 0xffff; filehdr_in->pe.e_ss = 0x0; filehdr_in->pe.e_sp = 0xb8; filehdr_in->pe.e_csum = 0x0; filehdr_in->pe.e_ip = 0x0; filehdr_in->pe.e_cs = 0x0; filehdr_in->pe.e_lfarlc = 0x40; filehdr_in->pe.e_ovno = 0x0; for (idx = 0; idx < 4; idx++) filehdr_in->pe.e_res[idx] = 0x0; filehdr_in->pe.e_oemid = 0x0; filehdr_in->pe.e_oeminfo = 0x0; for (idx = 0; idx < 10; idx++) filehdr_in->pe.e_res2[idx] = 0x0; filehdr_in->pe.e_lfanew = 0x80; /* This next collection of data are mostly just characters. It appears to be constant within the headers put on NT exes. */ filehdr_in->pe.dos_message[0] = 0x0eba1f0e; filehdr_in->pe.dos_message[1] = 0xcd09b400; filehdr_in->pe.dos_message[2] = 0x4c01b821; filehdr_in->pe.dos_message[3] = 0x685421cd; filehdr_in->pe.dos_message[4] = 0x70207369; filehdr_in->pe.dos_message[5] = 0x72676f72; filehdr_in->pe.dos_message[6] = 0x63206d61; filehdr_in->pe.dos_message[7] = 0x6f6e6e61; filehdr_in->pe.dos_message[8] = 0x65622074; filehdr_in->pe.dos_message[9] = 0x6e757220; filehdr_in->pe.dos_message[10] = 0x206e6920; filehdr_in->pe.dos_message[11] = 0x20534f44; filehdr_in->pe.dos_message[12] = 0x65646f6d; filehdr_in->pe.dos_message[13] = 0x0a0d0d2e; filehdr_in->pe.dos_message[14] = 0x24; filehdr_in->pe.dos_message[15] = 0x0; filehdr_in->pe.nt_signature = NT_SIGNATURE; H_PUT_16 (abfd, filehdr_in->f_magic, filehdr_out->f_magic); H_PUT_16 (abfd, filehdr_in->f_nscns, filehdr_out->f_nscns); H_PUT_32 (abfd, time (0), filehdr_out->f_timdat); PUT_FILEHDR_SYMPTR (abfd, filehdr_in->f_symptr, filehdr_out->f_symptr); H_PUT_32 (abfd, filehdr_in->f_nsyms, filehdr_out->f_nsyms); H_PUT_16 (abfd, filehdr_in->f_opthdr, filehdr_out->f_opthdr); H_PUT_16 (abfd, filehdr_in->f_flags, filehdr_out->f_flags); /* Put in extra dos header stuff. This data remains essentially constant, it just has to be tacked on to the beginning of all exes for NT. */ H_PUT_16 (abfd, filehdr_in->pe.e_magic, filehdr_out->e_magic); H_PUT_16 (abfd, filehdr_in->pe.e_cblp, filehdr_out->e_cblp); H_PUT_16 (abfd, filehdr_in->pe.e_cp, filehdr_out->e_cp); H_PUT_16 (abfd, filehdr_in->pe.e_crlc, filehdr_out->e_crlc); H_PUT_16 (abfd, filehdr_in->pe.e_cparhdr, filehdr_out->e_cparhdr); H_PUT_16 (abfd, filehdr_in->pe.e_minalloc, filehdr_out->e_minalloc); H_PUT_16 (abfd, filehdr_in->pe.e_maxalloc, filehdr_out->e_maxalloc); H_PUT_16 (abfd, filehdr_in->pe.e_ss, filehdr_out->e_ss); H_PUT_16 (abfd, filehdr_in->pe.e_sp, filehdr_out->e_sp); H_PUT_16 (abfd, filehdr_in->pe.e_csum, filehdr_out->e_csum); H_PUT_16 (abfd, filehdr_in->pe.e_ip, filehdr_out->e_ip); H_PUT_16 (abfd, filehdr_in->pe.e_cs, filehdr_out->e_cs); H_PUT_16 (abfd, filehdr_in->pe.e_lfarlc, filehdr_out->e_lfarlc); H_PUT_16 (abfd, filehdr_in->pe.e_ovno, filehdr_out->e_ovno); for (idx = 0; idx < 4; idx++) H_PUT_16 (abfd, filehdr_in->pe.e_res[idx], filehdr_out->e_res[idx]); H_PUT_16 (abfd, filehdr_in->pe.e_oemid, filehdr_out->e_oemid); H_PUT_16 (abfd, filehdr_in->pe.e_oeminfo, filehdr_out->e_oeminfo); for (idx = 0; idx < 10; idx++) H_PUT_16 (abfd, filehdr_in->pe.e_res2[idx], filehdr_out->e_res2[idx]); H_PUT_32 (abfd, filehdr_in->pe.e_lfanew, filehdr_out->e_lfanew); for (idx = 0; idx < 16; idx++) H_PUT_32 (abfd, filehdr_in->pe.dos_message[idx], filehdr_out->dos_message[idx]); /* Also put in the NT signature. */ H_PUT_32 (abfd, filehdr_in->pe.nt_signature, filehdr_out->nt_signature); return FILHSZ; } unsigned int _bfd_XX_only_swap_filehdr_out (bfd * abfd, void * in, void * out) { struct internal_filehdr *filehdr_in = (struct internal_filehdr *) in; FILHDR *filehdr_out = (FILHDR *) out; H_PUT_16 (abfd, filehdr_in->f_magic, filehdr_out->f_magic); H_PUT_16 (abfd, filehdr_in->f_nscns, filehdr_out->f_nscns); H_PUT_32 (abfd, filehdr_in->f_timdat, filehdr_out->f_timdat); PUT_FILEHDR_SYMPTR (abfd, filehdr_in->f_symptr, filehdr_out->f_symptr); H_PUT_32 (abfd, filehdr_in->f_nsyms, filehdr_out->f_nsyms); H_PUT_16 (abfd, filehdr_in->f_opthdr, filehdr_out->f_opthdr); H_PUT_16 (abfd, filehdr_in->f_flags, filehdr_out->f_flags); return FILHSZ; } unsigned int _bfd_XXi_swap_scnhdr_out (bfd * abfd, void * in, void * out) { struct internal_scnhdr *scnhdr_int = (struct internal_scnhdr *) in; SCNHDR *scnhdr_ext = (SCNHDR *) out; unsigned int ret = SCNHSZ; bfd_vma ps; bfd_vma ss; memcpy (scnhdr_ext->s_name, scnhdr_int->s_name, sizeof (scnhdr_int->s_name)); PUT_SCNHDR_VADDR (abfd, ((scnhdr_int->s_vaddr - pe_data (abfd)->pe_opthdr.ImageBase) & 0xffffffff), scnhdr_ext->s_vaddr); /* NT wants the size data to be rounded up to the next NT_FILE_ALIGNMENT, but zero if it has no content (as in .bss, sometimes). */ if ((scnhdr_int->s_flags & IMAGE_SCN_CNT_UNINITIALIZED_DATA) != 0) { if (bfd_pe_executable_p (abfd)) { ps = scnhdr_int->s_size; ss = 0; } else { ps = 0; ss = scnhdr_int->s_size; } } else { if (bfd_pe_executable_p (abfd)) ps = scnhdr_int->s_paddr; else ps = 0; ss = scnhdr_int->s_size; } PUT_SCNHDR_SIZE (abfd, ss, scnhdr_ext->s_size); /* s_paddr in PE is really the virtual size. */ PUT_SCNHDR_PADDR (abfd, ps, scnhdr_ext->s_paddr); PUT_SCNHDR_SCNPTR (abfd, scnhdr_int->s_scnptr, scnhdr_ext->s_scnptr); PUT_SCNHDR_RELPTR (abfd, scnhdr_int->s_relptr, scnhdr_ext->s_relptr); PUT_SCNHDR_LNNOPTR (abfd, scnhdr_int->s_lnnoptr, scnhdr_ext->s_lnnoptr); { /* Extra flags must be set when dealing with PE. All sections should also have the IMAGE_SCN_MEM_READ (0x40000000) flag set. In addition, the .text section must have IMAGE_SCN_MEM_EXECUTE (0x20000000) and the data sections (.idata, .data, .bss, .CRT) must have IMAGE_SCN_MEM_WRITE set (this is especially important when dealing with the .idata section since the addresses for routines from .dlls must be overwritten). If .reloc section data is ever generated, we must add IMAGE_SCN_MEM_DISCARDABLE (0x02000000). Also, the resource data should also be read and writable. */ /* FIXME: Alignment is also encoded in this field, at least on PPC and ARM-WINCE. Although - how do we get the original alignment field back ? */ typedef struct { const char * section_name; unsigned long must_have; } pe_required_section_flags; pe_required_section_flags known_sections [] = { { ".arch", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_DISCARDABLE | IMAGE_SCN_ALIGN_8BYTES }, { ".bss", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_UNINITIALIZED_DATA | IMAGE_SCN_MEM_WRITE }, { ".data", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE }, { ".edata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA }, { ".idata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE }, { ".pdata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA }, { ".rdata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA }, { ".reloc", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_DISCARDABLE }, { ".rsrc", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE }, { ".text" , IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_CODE | IMAGE_SCN_MEM_EXECUTE }, { ".tls", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE }, { ".xdata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA }, { NULL, 0} }; pe_required_section_flags * p; /* We have defaulted to adding the IMAGE_SCN_MEM_WRITE flag, but now we know exactly what this specific section wants so we remove it and then allow the must_have field to add it back in if necessary. However, we don't remove IMAGE_SCN_MEM_WRITE flag from .text if the default WP_TEXT file flag has been cleared. WP_TEXT may be cleared by ld --enable-auto-import (if auto-import is actually needed), by ld --omagic, or by obcopy --writable-text. */ for (p = known_sections; p->section_name; p++) if (strcmp (scnhdr_int->s_name, p->section_name) == 0) { if (strcmp (scnhdr_int->s_name, ".text") || (bfd_get_file_flags (abfd) & WP_TEXT)) scnhdr_int->s_flags &= ~IMAGE_SCN_MEM_WRITE; scnhdr_int->s_flags |= p->must_have; break; } H_PUT_32 (abfd, scnhdr_int->s_flags, scnhdr_ext->s_flags); } if (coff_data (abfd)->link_info && ! coff_data (abfd)->link_info->relocatable && ! coff_data (abfd)->link_info->shared && strcmp (scnhdr_int->s_name, ".text") == 0) { /* By inference from looking at MS output, the 32 bit field which is the combination of the number_of_relocs and number_of_linenos is used for the line number count in executables. A 16-bit field won't do for cc1. The MS document says that the number of relocs is zero for executables, but the 17-th bit has been observed to be there. Overflow is not an issue: a 4G-line program will overflow a bunch of other fields long before this! */ H_PUT_16 (abfd, (scnhdr_int->s_nlnno & 0xffff), scnhdr_ext->s_nlnno); H_PUT_16 (abfd, (scnhdr_int->s_nlnno >> 16), scnhdr_ext->s_nreloc); } else { if (scnhdr_int->s_nlnno <= 0xffff) H_PUT_16 (abfd, scnhdr_int->s_nlnno, scnhdr_ext->s_nlnno); else { (*_bfd_error_handler) (_("%s: line number overflow: 0x%lx > 0xffff"), bfd_get_filename (abfd), scnhdr_int->s_nlnno); bfd_set_error (bfd_error_file_truncated); H_PUT_16 (abfd, 0xffff, scnhdr_ext->s_nlnno); ret = 0; } /* Although we could encode 0xffff relocs here, we do not, to be consistent with other parts of bfd. Also it lets us warn, as we should never see 0xffff here w/o having the overflow flag set. */ if (scnhdr_int->s_nreloc < 0xffff) H_PUT_16 (abfd, scnhdr_int->s_nreloc, scnhdr_ext->s_nreloc); else { /* PE can deal with large #s of relocs, but not here. */ H_PUT_16 (abfd, 0xffff, scnhdr_ext->s_nreloc); scnhdr_int->s_flags |= IMAGE_SCN_LNK_NRELOC_OVFL; H_PUT_32 (abfd, scnhdr_int->s_flags, scnhdr_ext->s_flags); } } return ret; } static char * dir_names[IMAGE_NUMBEROF_DIRECTORY_ENTRIES] = { N_("Export Directory [.edata (or where ever we found it)]"), N_("Import Directory [parts of .idata]"), N_("Resource Directory [.rsrc]"), N_("Exception Directory [.pdata]"), N_("Security Directory"), N_("Base Relocation Directory [.reloc]"), N_("Debug Directory"), N_("Description Directory"), N_("Special Directory"), N_("Thread Storage Directory [.tls]"), N_("Load Configuration Directory"), N_("Bound Import Directory"), N_("Import Address Table Directory"), N_("Delay Import Directory"), N_("CLR Runtime Header"), N_("Reserved") }; #ifdef POWERPC_LE_PE /* The code for the PPC really falls in the "architecture dependent" category. However, it's not clear that anyone will ever care, so we're ignoring the issue for now; if/when PPC matters, some of this may need to go into peicode.h, or arguments passed to enable the PPC- specific code. */ #endif static bfd_boolean pe_print_idata (bfd * abfd, void * vfile) { FILE *file = (FILE *) vfile; bfd_byte *data; asection *section; bfd_signed_vma adj; #ifdef POWERPC_LE_PE asection *rel_section = bfd_get_section_by_name (abfd, ".reldata"); #endif bfd_size_type datasize = 0; bfd_size_type dataoff; bfd_size_type i; int onaline = 20; pe_data_type *pe = pe_data (abfd); struct internal_extra_pe_aouthdr *extra = &pe->pe_opthdr; bfd_vma addr; addr = extra->DataDirectory[PE_IMPORT_TABLE].VirtualAddress; if (addr == 0 && extra->DataDirectory[PE_IMPORT_TABLE].Size == 0) { /* Maybe the extra header isn't there. Look for the section. */ section = bfd_get_section_by_name (abfd, ".idata"); if (section == NULL) return TRUE; addr = section->vma; datasize = section->size; if (datasize == 0) return TRUE; } else { addr += extra->ImageBase; for (section = abfd->sections; section != NULL; section = section->next) { datasize = section->size; if (addr >= section->vma && addr < section->vma + datasize) break; } if (section == NULL) { fprintf (file, _("\nThere is an import table, but the section containing it could not be found\n")); return TRUE; } } fprintf (file, _("\nThere is an import table in %s at 0x%lx\n"), section->name, (unsigned long) addr); dataoff = addr - section->vma; datasize -= dataoff; #ifdef POWERPC_LE_PE if (rel_section != 0 && rel_section->size != 0) { /* The toc address can be found by taking the starting address, which on the PPC locates a function descriptor. The descriptor consists of the function code starting address followed by the address of the toc. The starting address we get from the bfd, and the descriptor is supposed to be in the .reldata section. */ bfd_vma loadable_toc_address; bfd_vma toc_address; bfd_vma start_address; bfd_byte *data; bfd_vma offset; if (!bfd_malloc_and_get_section (abfd, rel_section, &data)) { if (data != NULL) free (data); return FALSE; } offset = abfd->start_address - rel_section->vma; if (offset >= rel_section->size || offset + 8 > rel_section->size) { if (data != NULL) free (data); return FALSE; } start_address = bfd_get_32 (abfd, data + offset); loadable_toc_address = bfd_get_32 (abfd, data + offset + 4); toc_address = loadable_toc_address - 32768; fprintf (file, _("\nFunction descriptor located at the start address: %04lx\n"), (unsigned long int) (abfd->start_address)); fprintf (file, _("\tcode-base %08lx toc (loadable/actual) %08lx/%08lx\n"), start_address, loadable_toc_address, toc_address); if (data != NULL) free (data); } else { fprintf (file, _("\nNo reldata section! Function descriptor not decoded.\n")); } #endif fprintf (file, _("\nThe Import Tables (interpreted %s section contents)\n"), section->name); fprintf (file, _("\ vma: Hint Time Forward DLL First\n\ Table Stamp Chain Name Thunk\n")); /* Read the whole section. Some of the fields might be before dataoff. */ if (!bfd_malloc_and_get_section (abfd, section, &data)) { if (data != NULL) free (data); return FALSE; } adj = section->vma - extra->ImageBase; /* Print all image import descriptors. */ for (i = 0; i < datasize; i += onaline) { bfd_vma hint_addr; bfd_vma time_stamp; bfd_vma forward_chain; bfd_vma dll_name; bfd_vma first_thunk; int idx = 0; bfd_size_type j; char *dll; /* Print (i + extra->DataDirectory[PE_IMPORT_TABLE].VirtualAddress). */ fprintf (file, " %08lx\t", (unsigned long) (i + adj + dataoff)); hint_addr = bfd_get_32 (abfd, data + i + dataoff); time_stamp = bfd_get_32 (abfd, data + i + 4 + dataoff); forward_chain = bfd_get_32 (abfd, data + i + 8 + dataoff); dll_name = bfd_get_32 (abfd, data + i + 12 + dataoff); first_thunk = bfd_get_32 (abfd, data + i + 16 + dataoff); fprintf (file, "%08lx %08lx %08lx %08lx %08lx\n", (unsigned long) hint_addr, (unsigned long) time_stamp, (unsigned long) forward_chain, (unsigned long) dll_name, (unsigned long) first_thunk); if (hint_addr == 0 && first_thunk == 0) break; if (dll_name - adj >= section->size) break; dll = (char *) data + dll_name - adj; fprintf (file, _("\n\tDLL Name: %s\n"), dll); if (hint_addr != 0) { bfd_byte *ft_data; asection *ft_section; bfd_vma ft_addr; bfd_size_type ft_datasize; int ft_idx; int ft_allocated = 0; fprintf (file, _("\tvma: Hint/Ord Member-Name Bound-To\n")); idx = hint_addr - adj; ft_addr = first_thunk + extra->ImageBase; ft_data = data; ft_idx = first_thunk - adj; ft_allocated = 0; if (first_thunk != hint_addr) { /* Find the section which contains the first thunk. */ for (ft_section = abfd->sections; ft_section != NULL; ft_section = ft_section->next) { ft_datasize = ft_section->size; if (ft_addr >= ft_section->vma && ft_addr < ft_section->vma + ft_datasize) break; } if (ft_section == NULL) { fprintf (file, _("\nThere is a first thunk, but the section containing it could not be found\n")); continue; } /* Now check to see if this section is the same as our current section. If it is not then we will have to load its data in. */ if (ft_section == section) { ft_data = data; ft_idx = first_thunk - adj; } else { ft_idx = first_thunk - (ft_section->vma - extra->ImageBase); ft_data = bfd_malloc (datasize); if (ft_data == NULL) continue; /* Read datasize bfd_bytes starting at offset ft_idx. */ if (! bfd_get_section_contents (abfd, ft_section, ft_data, (bfd_vma) ft_idx, datasize)) { free (ft_data); continue; } ft_idx = 0; ft_allocated = 1; } } /* Print HintName vector entries. */ #ifdef COFF_WITH_pex64 for (j = 0; j < datasize; j += 8) { unsigned long member = bfd_get_32 (abfd, data + idx + j); unsigned long member_high = bfd_get_32 (abfd, data + idx + j + 4); if (!member && !member_high) break; if (member_high & 0x80000000) fprintf (file, "\t%lx%08lx\t %4lx%08lx ", member_high,member, member_high & 0x7fffffff, member); else { int ordinal; char *member_name; ordinal = bfd_get_16 (abfd, data + member - adj); member_name = (char *) data + member - adj + 2; fprintf (file, "\t%04lx\t %4d %s",member, ordinal, member_name); } /* If the time stamp is not zero, the import address table holds actual addresses. */ if (time_stamp != 0 && first_thunk != 0 && first_thunk != hint_addr) fprintf (file, "\t%04lx", (long) bfd_get_32 (abfd, ft_data + ft_idx + j)); fprintf (file, "\n"); } #else for (j = 0; j < datasize; j += 4) { unsigned long member = bfd_get_32 (abfd, data + idx + j); /* Print single IMAGE_IMPORT_BY_NAME vector. */ if (member == 0) break; if (member & 0x80000000) fprintf (file, "\t%04lx\t %4lu ", member, member & 0x7fffffff); else { int ordinal; char *member_name; ordinal = bfd_get_16 (abfd, data + member - adj); member_name = (char *) data + member - adj + 2; fprintf (file, "\t%04lx\t %4d %s", member, ordinal, member_name); } /* If the time stamp is not zero, the import address table holds actual addresses. */ if (time_stamp != 0 && first_thunk != 0 && first_thunk != hint_addr) fprintf (file, "\t%04lx", (long) bfd_get_32 (abfd, ft_data + ft_idx + j)); fprintf (file, "\n"); } #endif if (ft_allocated) free (ft_data); } fprintf (file, "\n"); } free (data); return TRUE; } static bfd_boolean pe_print_edata (bfd * abfd, void * vfile) { FILE *file = (FILE *) vfile; bfd_byte *data; asection *section; bfd_size_type datasize = 0; bfd_size_type dataoff; bfd_size_type i; bfd_signed_vma adj; struct EDT_type { long export_flags; /* Reserved - should be zero. */ long time_stamp; short major_ver; short minor_ver; bfd_vma name; /* RVA - relative to image base. */ long base; /* Ordinal base. */ unsigned long num_functions;/* Number in the export address table. */ unsigned long num_names; /* Number in the name pointer table. */ bfd_vma eat_addr; /* RVA to the export address table. */ bfd_vma npt_addr; /* RVA to the Export Name Pointer Table. */ bfd_vma ot_addr; /* RVA to the Ordinal Table. */ } edt; pe_data_type *pe = pe_data (abfd); struct internal_extra_pe_aouthdr *extra = &pe->pe_opthdr; bfd_vma addr; addr = extra->DataDirectory[PE_EXPORT_TABLE].VirtualAddress; if (addr == 0 && extra->DataDirectory[PE_EXPORT_TABLE].Size == 0) { /* Maybe the extra header isn't there. Look for the section. */ section = bfd_get_section_by_name (abfd, ".edata"); if (section == NULL) return TRUE; addr = section->vma; dataoff = 0; datasize = section->size; if (datasize == 0) return TRUE; } else { addr += extra->ImageBase; for (section = abfd->sections; section != NULL; section = section->next) if (addr >= section->vma && addr < section->vma + section->size) break; if (section == NULL) { fprintf (file, _("\nThere is an export table, but the section containing it could not be found\n")); return TRUE; } dataoff = addr - section->vma; datasize = extra->DataDirectory[PE_EXPORT_TABLE].Size; if (datasize > section->size - dataoff) { fprintf (file, _("\nThere is an export table in %s, but it does not fit into that section\n"), section->name); return TRUE; } + } + + /* PR 17512: Handle corrupt PE binaries. */ + if (datasize < 36) + { + fprintf (file, + _("\nThere is an export table in %s, but it is too small (%d)\n"), + section->name, (int) datasize); + return TRUE; } fprintf (file, _("\nThere is an export table in %s at 0x%lx\n"), section->name, (unsigned long) addr); data = bfd_malloc (datasize); if (data == NULL) return FALSE; if (! bfd_get_section_contents (abfd, section, data, (file_ptr) dataoff, datasize)) return FALSE; /* Go get Export Directory Table. */ edt.export_flags = bfd_get_32 (abfd, data + 0); edt.time_stamp = bfd_get_32 (abfd, data + 4); edt.major_ver = bfd_get_16 (abfd, data + 8); edt.minor_ver = bfd_get_16 (abfd, data + 10); edt.name = bfd_get_32 (abfd, data + 12); edt.base = bfd_get_32 (abfd, data + 16); edt.num_functions = bfd_get_32 (abfd, data + 20); edt.num_names = bfd_get_32 (abfd, data + 24); edt.eat_addr = bfd_get_32 (abfd, data + 28); edt.npt_addr = bfd_get_32 (abfd, data + 32); edt.ot_addr = bfd_get_32 (abfd, data + 36); adj = section->vma - extra->ImageBase + dataoff; /* Dump the EDT first. */ fprintf (file, _("\nThe Export Tables (interpreted %s section contents)\n\n"), section->name); fprintf (file, _("Export Flags \t\t\t%lx\n"), (unsigned long) edt.export_flags); fprintf (file, _("Time/Date stamp \t\t%lx\n"), (unsigned long) edt.time_stamp); fprintf (file, _("Major/Minor \t\t\t%d/%d\n"), edt.major_ver, edt.minor_ver); fprintf (file, _("Name \t\t\t\t")); fprintf_vma (file, edt.name); fprintf (file, " %s\n", data + edt.name - adj); fprintf (file, _("Ordinal Base \t\t\t%ld\n"), edt.base); fprintf (file, _("Number in:\n")); fprintf (file, _("\tExport Address Table \t\t%08lx\n"), edt.num_functions); fprintf (file, _("\t[Name Pointer/Ordinal] Table\t%08lx\n"), edt.num_names); fprintf (file, _("Table Addresses\n")); fprintf (file, _("\tExport Address Table \t\t")); fprintf_vma (file, edt.eat_addr); fprintf (file, "\n"); fprintf (file, _("\tName Pointer Table \t\t")); fprintf_vma (file, edt.npt_addr); fprintf (file, "\n"); fprintf (file, _("\tOrdinal Table \t\t\t")); fprintf_vma (file, edt.ot_addr); fprintf (file, "\n"); /* The next table to find is the Export Address Table. It's basically a list of pointers that either locate a function in this dll, or forward the call to another dll. Something like: typedef union { long export_rva; long forwarder_rva; } export_address_table_entry; */ fprintf (file, _("\nExport Address Table -- Ordinal Base %ld\n"), edt.base); for (i = 0; i < edt.num_functions; ++i) { bfd_vma eat_member = bfd_get_32 (abfd, data + edt.eat_addr + (i * 4) - adj); if (eat_member == 0) continue; if (eat_member - adj <= datasize) { /* This rva is to a name (forwarding function) in our section. */ /* Should locate a function descriptor. */ fprintf (file, "\t[%4ld] +base[%4ld] %04lx %s -- %s\n", (long) i, (long) (i + edt.base), (unsigned long) eat_member, _("Forwarder RVA"), data + eat_member - adj); } else { /* Should locate a function descriptor in the reldata section. */ fprintf (file, "\t[%4ld] +base[%4ld] %04lx %s\n", (long) i, (long) (i + edt.base), (unsigned long) eat_member, _("Export RVA")); } } /* The Export Name Pointer Table is paired with the Export Ordinal Table. */ /* Dump them in parallel for clarity. */ fprintf (file, _("\n[Ordinal/Name Pointer] Table\n")); for (i = 0; i < edt.num_names; ++i) { bfd_vma name_ptr = bfd_get_32 (abfd, data + edt.npt_addr + (i*4) - adj); char *name = (char *) data + name_ptr - adj; bfd_vma ord = bfd_get_16 (abfd, data + edt.ot_addr + (i*2) - adj); fprintf (file, "\t[%4ld] %s\n", (long) ord, name); } free (data); return TRUE; } /* This really is architecture dependent. On IA-64, a .pdata entry consists of three dwords containing relative virtual addresses that specify the start and end address of the code range the entry covers and the address of the corresponding unwind info data. */ static bfd_boolean pe_print_pdata (bfd * abfd, void * vfile) { #if defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) # define PDATA_ROW_SIZE (3 * 8) #else # define PDATA_ROW_SIZE (5 * 4) #endif FILE *file = (FILE *) vfile; bfd_byte *data = 0; asection *section = bfd_get_section_by_name (abfd, ".pdata"); bfd_size_type datasize = 0; bfd_size_type i; bfd_size_type start, stop; int onaline = PDATA_ROW_SIZE; if (section == NULL || coff_section_data (abfd, section) == NULL || pei_section_data (abfd, section) == NULL) return TRUE; stop = pei_section_data (abfd, section)->virt_size; if ((stop % onaline) != 0) fprintf (file, _("Warning, .pdata section size (%ld) is not a multiple of %d\n"), (long) stop, onaline); fprintf (file, _("\nThe Function Table (interpreted .pdata section contents)\n")); #if defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) fprintf (file, _(" vma:\t\t\tBegin Address End Address Unwind Info\n")); #else fprintf (file, _("\ vma:\t\tBegin End EH EH PrologEnd Exception\n\ \t\tAddress Address Handler Data Address Mask\n")); #endif datasize = section->size; if (datasize == 0) return TRUE; if (! bfd_malloc_and_get_section (abfd, section, &data)) { if (data != NULL) free (data); return FALSE; } start = 0; for (i = start; i < stop; i += onaline) { bfd_vma begin_addr; bfd_vma end_addr; bfd_vma eh_handler; bfd_vma eh_data; bfd_vma prolog_end_addr; int em_data; if (i + PDATA_ROW_SIZE > stop) break; begin_addr = GET_PDATA_ENTRY (abfd, data + i ); end_addr = GET_PDATA_ENTRY (abfd, data + i + 4); eh_handler = GET_PDATA_ENTRY (abfd, data + i + 8); eh_data = GET_PDATA_ENTRY (abfd, data + i + 12); prolog_end_addr = GET_PDATA_ENTRY (abfd, data + i + 16); if (begin_addr == 0 && end_addr == 0 && eh_handler == 0 && eh_data == 0 && prolog_end_addr == 0) /* We are probably into the padding of the section now. */ break; em_data = ((eh_handler & 0x1) << 2) | (prolog_end_addr & 0x3); eh_handler &= ~(bfd_vma) 0x3; prolog_end_addr &= ~(bfd_vma) 0x3; fputc (' ', file); fprintf_vma (file, i + section->vma); fputc ('\t', file); fprintf_vma (file, begin_addr); fputc (' ', file); fprintf_vma (file, end_addr); fputc (' ', file); fprintf_vma (file, eh_handler); #if !defined(COFF_WITH_pep) || defined(COFF_WITH_pex64) fputc (' ', file); fprintf_vma (file, eh_data); fputc (' ', file); fprintf_vma (file, prolog_end_addr); fprintf (file, " %x", em_data); #endif #ifdef POWERPC_LE_PE if (eh_handler == 0 && eh_data != 0) { /* Special bits here, although the meaning may be a little mysterious. The only one I know for sure is 0x03 Code Significance 0x00 None 0x01 Register Save Millicode 0x02 Register Restore Millicode 0x03 Glue Code Sequence. */ switch (eh_data) { case 0x01: fprintf (file, _(" Register save millicode")); break; case 0x02: fprintf (file, _(" Register restore millicode")); break; case 0x03: fprintf (file, _(" Glue code sequence")); break; default: break; } } #endif fprintf (file, "\n"); } free (data); return TRUE; } #define IMAGE_REL_BASED_HIGHADJ 4 static const char * const tbl[] = { "ABSOLUTE", "HIGH", "LOW", "HIGHLOW", "HIGHADJ", "MIPS_JMPADDR", "SECTION", "REL32", "RESERVED1", "MIPS_JMPADDR16", "DIR64", "HIGH3ADJ", "UNKNOWN", /* MUST be last. */ }; static bfd_boolean pe_print_reloc (bfd * abfd, void * vfile) { FILE *file = (FILE *) vfile; bfd_byte *data = 0; asection *section = bfd_get_section_by_name (abfd, ".reloc"); bfd_size_type datasize; bfd_size_type i; bfd_size_type start, stop; if (section == NULL) return TRUE; if (section->size == 0) return TRUE; fprintf (file, _("\n\nPE File Base Relocations (interpreted .reloc section contents)\n")); datasize = section->size; if (! bfd_malloc_and_get_section (abfd, section, &data)) { if (data != NULL) free (data); return FALSE; } start = 0; stop = section->size; for (i = start; i < stop;) { int j; bfd_vma virtual_address; long number, size; /* The .reloc section is a sequence of blocks, with a header consisting of two 32 bit quantities, followed by a number of 16 bit entries. */ virtual_address = bfd_get_32 (abfd, data+i); size = bfd_get_32 (abfd, data+i+4); number = (size - 8) / 2; if (size == 0) break; fprintf (file, _("\nVirtual Address: %08lx Chunk size %ld (0x%lx) Number of fixups %ld\n"), (unsigned long) virtual_address, size, size, number); for (j = 0; j < number; ++j) { unsigned short e = bfd_get_16 (abfd, data + i + 8 + j * 2); unsigned int t = (e & 0xF000) >> 12; int off = e & 0x0FFF; if (t >= sizeof (tbl) / sizeof (tbl[0])) t = (sizeof (tbl) / sizeof (tbl[0])) - 1; fprintf (file, _("\treloc %4d offset %4x [%4lx] %s"), j, off, (long) (off + virtual_address), tbl[t]); /* HIGHADJ takes an argument, - the next record *is* the low 16 bits of addend. */ if (t == IMAGE_REL_BASED_HIGHADJ) { fprintf (file, " (%4x)", ((unsigned int) bfd_get_16 (abfd, data + i + 8 + j * 2 + 2))); j++; } fprintf (file, "\n"); } i += size; } free (data); return TRUE; } /* Print out the program headers. */ bfd_boolean _bfd_XX_print_private_bfd_data_common (bfd * abfd, void * vfile) { FILE *file = (FILE *) vfile; int j; pe_data_type *pe = pe_data (abfd); struct internal_extra_pe_aouthdr *i = &pe->pe_opthdr; const char *subsystem_name = NULL; const char *name; /* The MS dumpbin program reportedly ands with 0xff0f before printing the characteristics field. Not sure why. No reason to emulate it here. */ fprintf (file, _("\nCharacteristics 0x%x\n"), pe->real_flags); #undef PF #define PF(x, y) if (pe->real_flags & x) { fprintf (file, "\t%s\n", y); } PF (IMAGE_FILE_RELOCS_STRIPPED, "relocations stripped"); PF (IMAGE_FILE_EXECUTABLE_IMAGE, "executable"); PF (IMAGE_FILE_LINE_NUMS_STRIPPED, "line numbers stripped"); PF (IMAGE_FILE_LOCAL_SYMS_STRIPPED, "symbols stripped"); PF (IMAGE_FILE_LARGE_ADDRESS_AWARE, "large address aware"); PF (IMAGE_FILE_BYTES_REVERSED_LO, "little endian"); PF (IMAGE_FILE_32BIT_MACHINE, "32 bit words"); PF (IMAGE_FILE_DEBUG_STRIPPED, "debugging information removed"); PF (IMAGE_FILE_SYSTEM, "system file"); PF (IMAGE_FILE_DLL, "DLL"); PF (IMAGE_FILE_BYTES_REVERSED_HI, "big endian"); #undef PF /* ctime implies '\n'. */ { time_t t = pe->coff.timestamp; fprintf (file, "\nTime/Date\t\t%s", ctime (&t)); } #ifndef IMAGE_NT_OPTIONAL_HDR_MAGIC # define IMAGE_NT_OPTIONAL_HDR_MAGIC 0x10b #endif #ifndef IMAGE_NT_OPTIONAL_HDR64_MAGIC # define IMAGE_NT_OPTIONAL_HDR64_MAGIC 0x20b #endif #ifndef IMAGE_NT_OPTIONAL_HDRROM_MAGIC # define IMAGE_NT_OPTIONAL_HDRROM_MAGIC 0x107 #endif switch (i->Magic) { case IMAGE_NT_OPTIONAL_HDR_MAGIC: name = "PE32"; break; case IMAGE_NT_OPTIONAL_HDR64_MAGIC: name = "PE32+"; break; case IMAGE_NT_OPTIONAL_HDRROM_MAGIC: name = "ROM"; break; default: name = NULL; break; } fprintf (file, "Magic\t\t\t%04x", i->Magic); if (name) fprintf (file, "\t(%s)",name); fprintf (file, "\nMajorLinkerVersion\t%d\n", i->MajorLinkerVersion); fprintf (file, "MinorLinkerVersion\t%d\n", i->MinorLinkerVersion); fprintf (file, "SizeOfCode\t\t%08lx\n", i->SizeOfCode); fprintf (file, "SizeOfInitializedData\t%08lx\n", i->SizeOfInitializedData); fprintf (file, "SizeOfUninitializedData\t%08lx\n", i->SizeOfUninitializedData); fprintf (file, "AddressOfEntryPoint\t"); fprintf_vma (file, i->AddressOfEntryPoint); fprintf (file, "\nBaseOfCode\t\t"); fprintf_vma (file, i->BaseOfCode); #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64) /* PE32+ does not have BaseOfData member! */ fprintf (file, "\nBaseOfData\t\t"); fprintf_vma (file, i->BaseOfData); #endif fprintf (file, "\nImageBase\t\t"); fprintf_vma (file, i->ImageBase); fprintf (file, "\nSectionAlignment\t"); fprintf_vma (file, i->SectionAlignment); fprintf (file, "\nFileAlignment\t\t"); fprintf_vma (file, i->FileAlignment); fprintf (file, "\nMajorOSystemVersion\t%d\n", i->MajorOperatingSystemVersion); fprintf (file, "MinorOSystemVersion\t%d\n", i->MinorOperatingSystemVersion); fprintf (file, "MajorImageVersion\t%d\n", i->MajorImageVersion); fprintf (file, "MinorImageVersion\t%d\n", i->MinorImageVersion); fprintf (file, "MajorSubsystemVersion\t%d\n", i->MajorSubsystemVersion); fprintf (file, "MinorSubsystemVersion\t%d\n", i->MinorSubsystemVersion); fprintf (file, "Win32Version\t\t%08lx\n", i->Reserved1); fprintf (file, "SizeOfImage\t\t%08lx\n", i->SizeOfImage); fprintf (file, "SizeOfHeaders\t\t%08lx\n", i->SizeOfHeaders); fprintf (file, "CheckSum\t\t%08lx\n", i->CheckSum); switch (i->Subsystem) { case IMAGE_SUBSYSTEM_UNKNOWN: subsystem_name = "unspecified"; break; case IMAGE_SUBSYSTEM_NATIVE: subsystem_name = "NT native"; break; case IMAGE_SUBSYSTEM_WINDOWS_GUI: subsystem_name = "Windows GUI"; break; case IMAGE_SUBSYSTEM_WINDOWS_CUI: subsystem_name = "Windows CUI"; break; case IMAGE_SUBSYSTEM_POSIX_CUI: subsystem_name = "POSIX CUI"; break; case IMAGE_SUBSYSTEM_WINDOWS_CE_GUI: subsystem_name = "Wince CUI"; break; case IMAGE_SUBSYSTEM_EFI_APPLICATION: subsystem_name = "EFI application"; break; case IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER: subsystem_name = "EFI boot service driver"; break; case IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER: subsystem_name = "EFI runtime driver"; break; // These are from revision 8.0 of the MS PE/COFF spec case IMAGE_SUBSYSTEM_EFI_ROM: subsystem_name = "EFI ROM"; break; case IMAGE_SUBSYSTEM_XBOX: subsystem_name = "XBOX"; break; // Added default case for clarity - subsystem_name is NULL anyway. default: subsystem_name = NULL; } fprintf (file, "Subsystem\t\t%08x", i->Subsystem); if (subsystem_name) fprintf (file, "\t(%s)", subsystem_name); fprintf (file, "\nDllCharacteristics\t%08x\n", i->DllCharacteristics); fprintf (file, "SizeOfStackReserve\t"); fprintf_vma (file, i->SizeOfStackReserve); fprintf (file, "\nSizeOfStackCommit\t"); fprintf_vma (file, i->SizeOfStackCommit); fprintf (file, "\nSizeOfHeapReserve\t"); fprintf_vma (file, i->SizeOfHeapReserve); fprintf (file, "\nSizeOfHeapCommit\t"); fprintf_vma (file, i->SizeOfHeapCommit); fprintf (file, "\nLoaderFlags\t\t%08lx\n", i->LoaderFlags); fprintf (file, "NumberOfRvaAndSizes\t%08lx\n", i->NumberOfRvaAndSizes); fprintf (file, "\nThe Data Directory\n"); for (j = 0; j < IMAGE_NUMBEROF_DIRECTORY_ENTRIES; j++) { fprintf (file, "Entry %1x ", j); fprintf_vma (file, i->DataDirectory[j].VirtualAddress); fprintf (file, " %08lx ", i->DataDirectory[j].Size); fprintf (file, "%s\n", dir_names[j]); } pe_print_idata (abfd, vfile); pe_print_edata (abfd, vfile); pe_print_pdata (abfd, vfile); pe_print_reloc (abfd, vfile); return TRUE; } /* Copy any private info we understand from the input bfd to the output bfd. */ bfd_boolean _bfd_XX_bfd_copy_private_bfd_data_common (bfd * ibfd, bfd * obfd) { /* One day we may try to grok other private data. */ if (ibfd->xvec->flavour != bfd_target_coff_flavour || obfd->xvec->flavour != bfd_target_coff_flavour) return TRUE; pe_data (obfd)->pe_opthdr = pe_data (ibfd)->pe_opthdr; pe_data (obfd)->dll = pe_data (ibfd)->dll; /* For strip: if we removed .reloc, we'll make a real mess of things if we don't remove this entry as well. */ if (! pe_data (obfd)->has_reloc_section) { pe_data (obfd)->pe_opthdr.DataDirectory[PE_BASE_RELOCATION_TABLE].VirtualAddress = 0; pe_data (obfd)->pe_opthdr.DataDirectory[PE_BASE_RELOCATION_TABLE].Size = 0; } return TRUE; } /* Copy private section data. */ bfd_boolean _bfd_XX_bfd_copy_private_section_data (bfd *ibfd, asection *isec, bfd *obfd, asection *osec) { if (bfd_get_flavour (ibfd) != bfd_target_coff_flavour || bfd_get_flavour (obfd) != bfd_target_coff_flavour) return TRUE; if (coff_section_data (ibfd, isec) != NULL && pei_section_data (ibfd, isec) != NULL) { if (coff_section_data (obfd, osec) == NULL) { bfd_size_type amt = sizeof (struct coff_section_tdata); osec->used_by_bfd = bfd_zalloc (obfd, amt); if (osec->used_by_bfd == NULL) return FALSE; } if (pei_section_data (obfd, osec) == NULL) { bfd_size_type amt = sizeof (struct pei_section_tdata); coff_section_data (obfd, osec)->tdata = bfd_zalloc (obfd, amt); if (coff_section_data (obfd, osec)->tdata == NULL) return FALSE; } pei_section_data (obfd, osec)->virt_size = pei_section_data (ibfd, isec)->virt_size; pei_section_data (obfd, osec)->pe_flags = pei_section_data (ibfd, isec)->pe_flags; } return TRUE; } void _bfd_XX_get_symbol_info (bfd * abfd, asymbol *symbol, symbol_info *ret) { coff_get_symbol_info (abfd, symbol, ret); } /* Handle the .idata section and other things that need symbol table access. */ bfd_boolean _bfd_XXi_final_link_postscript (bfd * abfd, struct coff_final_link_info *pfinfo) { struct coff_link_hash_entry *h1; struct bfd_link_info *info = pfinfo->info; bfd_boolean result = TRUE; /* There are a few fields that need to be filled in now while we have symbol table access. The .idata subsections aren't directly available as sections, but they are in the symbol table, so get them from there. */ /* The import directory. This is the address of .idata$2, with size of .idata$2 + .idata$3. */ h1 = coff_link_hash_lookup (coff_hash_table (info), ".idata$2", FALSE, FALSE, TRUE); if (h1 != NULL) { /* PR ld/2729: We cannot rely upon all the output sections having been created properly, so check before referencing them. Issue a warning message for any sections tht could not be found. */ if (h1->root.u.def.section != NULL && h1->root.u.def.section->output_section != NULL) pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_TABLE].VirtualAddress = (h1->root.u.def.value + h1->root.u.def.section->output_section->vma + h1->root.u.def.section->output_offset); else { _bfd_error_handler (_("%B: unable to fill in DataDictionary[1] because .idata$2 is missing"), abfd); result = FALSE; } h1 = coff_link_hash_lookup (coff_hash_table (info), ".idata$4", FALSE, FALSE, TRUE); if (h1 != NULL && h1->root.u.def.section != NULL && h1->root.u.def.section->output_section != NULL) pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_TABLE].Size = ((h1->root.u.def.value + h1->root.u.def.section->output_section->vma + h1->root.u.def.section->output_offset) - pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_TABLE].VirtualAddress); else { _bfd_error_handler (_("%B: unable to fill in DataDictionary[1] because .idata$4 is missing"), abfd); result = FALSE; } /* The import address table. This is the size/address of .idata$5. */ h1 = coff_link_hash_lookup (coff_hash_table (info), ".idata$5", FALSE, FALSE, TRUE); if (h1 != NULL && h1->root.u.def.section != NULL && h1->root.u.def.section->output_section != NULL) pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE].VirtualAddress = (h1->root.u.def.value + h1->root.u.def.section->output_section->vma + h1->root.u.def.section->output_offset); else { _bfd_error_handler (_("%B: unable to fill in DataDictionary[12] because .idata$5 is missing"), abfd); result = FALSE; } h1 = coff_link_hash_lookup (coff_hash_table (info), ".idata$6", FALSE, FALSE, TRUE); if (h1 != NULL && h1->root.u.def.section != NULL && h1->root.u.def.section->output_section != NULL) pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE].Size = ((h1->root.u.def.value + h1->root.u.def.section->output_section->vma + h1->root.u.def.section->output_offset) - pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE].VirtualAddress); else { _bfd_error_handler (_("%B: unable to fill in DataDictionary[PE_IMPORT_ADDRESS_TABLE (12)] because .idata$6 is missing"), abfd); result = FALSE; } } h1 = coff_link_hash_lookup (coff_hash_table (info), "__tls_used", FALSE, FALSE, TRUE); if (h1 != NULL) { if (h1->root.u.def.section != NULL && h1->root.u.def.section->output_section != NULL) pe_data (abfd)->pe_opthdr.DataDirectory[PE_TLS_TABLE].VirtualAddress = (h1->root.u.def.value + h1->root.u.def.section->output_section->vma + h1->root.u.def.section->output_offset - pe_data (abfd)->pe_opthdr.ImageBase); else { _bfd_error_handler (_("%B: unable to fill in DataDictionary[9] because __tls_used is missing"), abfd); result = FALSE; } pe_data (abfd)->pe_opthdr.DataDirectory[PE_TLS_TABLE].Size = 0x18; } /* If we couldn't find idata$2, we either have an excessively trivial program or are in DEEP trouble; we have to assume trivial program.... */ return result; } Index: projects/bsd_rdma_4_9/contrib/binutils =================================================================== --- projects/bsd_rdma_4_9/contrib/binutils (revision 326161) +++ projects/bsd_rdma_4_9/contrib/binutils (revision 326162) Property changes on: projects/bsd_rdma_4_9/contrib/binutils ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/binutils:r325505-326161 Index: projects/bsd_rdma_4_9/share/examples/bhyve/vmrun.sh =================================================================== --- projects/bsd_rdma_4_9/share/examples/bhyve/vmrun.sh (revision 326161) +++ projects/bsd_rdma_4_9/share/examples/bhyve/vmrun.sh (revision 326162) @@ -1,371 +1,376 @@ #!/bin/sh # # Copyright (c) 2013 NetApp, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # LOADER=/usr/sbin/bhyveload BHYVECTL=/usr/sbin/bhyvectl FBSDRUN=/usr/sbin/bhyve DEFAULT_MEMSIZE=512M DEFAULT_CPUS=2 DEFAULT_TAPDEV=tap0 DEFAULT_CONSOLE=stdio DEFAULT_VIRTIO_DISK="./diskdev" DEFAULT_ISOFILE="./release.iso" errmsg() { echo "*** $1" } usage() { local msg=$1 - echo "Usage: vmrun.sh [-aEhiTv] [-c ] [-C ] [-d ]" + echo "Usage: vmrun.sh [-aAEhiTv] [-c ] [-C ] [-d ]" echo " [-e ] [-f ] [-F ]" echo " [-g ] [-H ]" echo " [-I ] [-l ]" echo " [-L ]" echo " [-m ] [-P ] [-t ] " echo "" echo " -h: display this help message" echo " -a: force memory mapped local APIC access" + echo " -A: use AHCI disk emulation instead of virtio" echo " -c: number of virtual cpus (default is ${DEFAULT_CPUS})" echo " -C: console device (default is ${DEFAULT_CONSOLE})" echo " -d: virtio diskdev file (default is ${DEFAULT_VIRTIO_DISK})" echo " -e: set FreeBSD loader environment variable" echo " -E: Use UEFI mode" echo " -f: Use a specific UEFI firmware" echo " -F: Use a custom UEFI GOP framebuffer size (default: w=1024,h=768)" echo " -g: listen for connection from kgdb at " echo " -H: host filesystem to export to the loader" echo " -i: force boot of the Installation CDROM image" echo " -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})" echo " -l: the OS loader to use (default is /boot/userboot.so)" echo " -L: IP address for UEFI GOP VNC server (default: 127.0.0.1)" echo " -m: memory size (default is ${DEFAULT_MEMSIZE})" echo " -p: pass-through a host PCI device at bus/slot/func (e.g. 10/0/0)" echo " -P: UEFI GOP VNC port (default: 5900)" echo " -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)" echo " -T: Enable tablet device (for UEFI GOP)" echo " -u: RTC keeps UTC time" echo " -v: Wait for VNC client connection before booting VM" echo " -w: ignore unimplemented MSRs" echo "" [ -n "$msg" ] && errmsg "$msg" exit 1 } if [ `id -u` -ne 0 ]; then errmsg "This script must be executed with superuser privileges" exit 1 fi kldstat -n vmm > /dev/null 2>&1 if [ $? -ne 0 ]; then errmsg "vmm.ko is not loaded" exit 1 fi force_install=0 isofile=${DEFAULT_ISOFILE} memsize=${DEFAULT_MEMSIZE} console=${DEFAULT_CONSOLE} cpus=${DEFAULT_CPUS} tap_total=0 disk_total=0 +disk_emulation="virtio-blk" gdbport=0 loader_opt="" bhyverun_opt="-H -A -P" pass_total=0 # EFI-specific options efi_mode=0 efi_firmware="/usr/local/share/uefi-firmware/BHYVE_UEFI.fd" vncwait="" vnchost="127.0.0.1" vncport=5900 fbsize="w=1024,h=768" tablet="" -while getopts ac:C:d:e:Ef:F:g:hH:iI:l:m:p:P:t:Tuvw c ; do +while getopts aAc:C:d:e:Ef:F:g:hH:iI:l:m:p:P:t:Tuvw c ; do case $c in a) bhyverun_opt="${bhyverun_opt} -a" ;; + A) + disk_emulation="ahci-hd" + ;; c) cpus=${OPTARG} ;; C) console=${OPTARG} ;; d) disk_dev=${OPTARG%%,*} disk_opts=${OPTARG#${disk_dev}} eval "disk_dev${disk_total}=\"${disk_dev}\"" eval "disk_opts${disk_total}=\"${disk_opts}\"" disk_total=$(($disk_total + 1)) ;; e) loader_opt="${loader_opt} -e ${OPTARG}" ;; E) efi_mode=1 ;; f) efi_firmware="${OPTARG}" ;; F) fbsize="${OPTARG}" ;; g) gdbport=${OPTARG} ;; H) host_base=`realpath ${OPTARG}` ;; i) force_install=1 ;; I) isofile=${OPTARG} ;; l) loader_opt="${loader_opt} -l ${OPTARG}" ;; L) vnchost="${OPTARG}" ;; m) memsize=${OPTARG} ;; p) eval "pass_dev${pass_total}=\"${OPTARG}\"" pass_total=$(($pass_total + 1)) ;; P) vncport="${OPTARG}" ;; t) eval "tap_dev${tap_total}=\"${OPTARG}\"" tap_total=$(($tap_total + 1)) ;; T) tablet="-s 30,xhci,tablet" ;; u) bhyverun_opt="${bhyverun_opt} -u" ;; v) vncwait=",wait" ;; w) bhyverun_opt="${bhyverun_opt} -w" ;; *) usage ;; esac done if [ $tap_total -eq 0 ] ; then tap_total=1 tap_dev0="${DEFAULT_TAPDEV}" fi if [ $disk_total -eq 0 ] ; then disk_total=1 disk_dev0="${DEFAULT_VIRTIO_DISK}" fi shift $((${OPTIND} - 1)) if [ $# -ne 1 ]; then usage "virtual machine name not specified" fi vmname="$1" if [ -n "${host_base}" ]; then loader_opt="${loader_opt} -h ${host_base}" fi # If PCI passthru devices are configured then guest memory must be wired if [ ${pass_total} -gt 0 ]; then loader_opt="${loader_opt} -S" bhyverun_opt="${bhyverun_opt} -S" fi if [ ${efi_mode} -gt 0 ]; then if [ ! -f ${efi_firmware} ]; then echo "Error: EFI Firmware ${efi_firmware} doesn't exist. Try: pkg install uefi-edk2-bhyve" exit 1 fi fi make_and_check_diskdev() { local virtio_diskdev="$1" # Create the virtio diskdev file if needed if [ ! -e ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" does not exist." echo "Creating it ..." truncate -s 8G ${virtio_diskdev} > /dev/null fi if [ ! -r ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" is not readable" exit 1 fi if [ ! -w ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" is not writable" exit 1 fi } echo "Launching virtual machine \"$vmname\" ..." first_diskdev="$disk_dev0" ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1 while [ 1 ]; do file -s ${first_diskdev} | grep "boot sector" > /dev/null rc=$? if [ $rc -ne 0 ]; then file -s ${first_diskdev} | grep ": Unix Fast File sys" > /dev/null rc=$? fi if [ $rc -ne 0 ]; then need_install=1 else need_install=0 fi if [ $force_install -eq 1 -o $need_install -eq 1 ]; then if [ ! -r ${isofile} ]; then echo -n "Installation CDROM image \"${isofile}\" " echo "is not readable" exit 1 fi BOOTDISKS="-d ${isofile}" installer_opt="-s 31:0,ahci-cd,${isofile}" else BOOTDISKS="" i=0 while [ $i -lt $disk_total ] ; do eval "disk=\$disk_dev${i}" if [ -r ${disk} ] ; then BOOTDISKS="$BOOTDISKS -d ${disk} " fi i=$(($i + 1)) done installer_opt="" fi if [ ${efi_mode} -eq 0 ]; then ${LOADER} -c ${console} -m ${memsize} ${BOOTDISKS} ${loader_opt} \ ${vmname} bhyve_exit=$? if [ $bhyve_exit -ne 0 ]; then break fi fi # # Build up args for additional tap and disk devices now. # nextslot=2 # slot 0 is hostbridge, slot 1 is lpc devargs="" # accumulate disk/tap args here i=0 while [ $i -lt $tap_total ] ; do eval "tapname=\$tap_dev${i}" devargs="$devargs -s $nextslot:0,virtio-net,${tapname} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done i=0 while [ $i -lt $disk_total ] ; do eval "disk=\$disk_dev${i}" eval "opts=\$disk_opts${i}" make_and_check_diskdev "${disk}" - devargs="$devargs -s $nextslot:0,virtio-blk,${disk}${opts} " + devargs="$devargs -s $nextslot:0,$disk_emulation,${disk}${opts} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done i=0 while [ $i -lt $pass_total ] ; do eval "pass=\$pass_dev${i}" devargs="$devargs -s $nextslot:0,passthru,${pass} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done efiargs="" if [ ${efi_mode} -gt 0 ]; then efiargs="-s 29,fbuf,tcp=${vnchost}:${vncport},${fbsize}${vncwait}" efiargs="${efiargs} -l bootrom,${efi_firmware}" efiargs="${efiargs} ${tablet}" fi ${FBSDRUN} -c ${cpus} -m ${memsize} ${bhyverun_opt} \ -g ${gdbport} \ -s 0:0,hostbridge \ -s 1:0,lpc \ ${efiargs} \ ${devargs} \ -l com1,${console} \ ${installer_opt} \ ${vmname} bhyve_exit=$? # bhyve returns the following status codes: # 0 - VM has been reset # 1 - VM has been powered off # 2 - VM has been halted # 3 - VM generated a triple fault # all other non-zero status codes are errors # if [ $bhyve_exit -ne 0 ]; then break fi done case $bhyve_exit in 0|1|2) # Cleanup /dev/vmm entry when bhyve did not exit # due to an error. ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1 ;; esac exit $bhyve_exit Index: projects/bsd_rdma_4_9/stand/common/load_elf.c =================================================================== --- projects/bsd_rdma_4_9/stand/common/load_elf.c (revision 326161) +++ projects/bsd_rdma_4_9/stand/common/load_elf.c (revision 326162) @@ -1,1038 +1,1038 @@ /*- * Copyright (c) 1998 Michael Smith * Copyright (c) 1998 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #define FREEBSD_ELF #include #include "bootstrap.h" #define COPYOUT(s,d,l) archsw.arch_copyout((vm_offset_t)(s), d, l) #if defined(__i386__) && __ELF_WORD_SIZE == 64 #undef ELF_TARG_CLASS #undef ELF_TARG_MACH #define ELF_TARG_CLASS ELFCLASS64 #define ELF_TARG_MACH EM_X86_64 #endif typedef struct elf_file { Elf_Phdr *ph; Elf_Ehdr *ehdr; Elf_Sym *symtab; Elf_Hashelt *hashtab; Elf_Hashelt nbuckets; Elf_Hashelt nchains; Elf_Hashelt *buckets; Elf_Hashelt *chains; Elf_Rel *rel; size_t relsz; Elf_Rela *rela; size_t relasz; char *strtab; size_t strsz; int fd; caddr_t firstpage; size_t firstlen; int kernel; u_int64_t off; } *elf_file_t; static int __elfN(loadimage)(struct preloaded_file *mp, elf_file_t ef, u_int64_t loadaddr); static int __elfN(lookup_symbol)(struct preloaded_file *mp, elf_file_t ef, const char* name, Elf_Sym* sym); static int __elfN(reloc_ptr)(struct preloaded_file *mp, elf_file_t ef, Elf_Addr p, void *val, size_t len); static int __elfN(parse_modmetadata)(struct preloaded_file *mp, elf_file_t ef, Elf_Addr p_start, Elf_Addr p_end); static symaddr_fn __elfN(symaddr); static char *fake_modname(const char *name); const char *__elfN(kerneltype) = "elf kernel"; const char *__elfN(moduletype) = "elf module"; u_int64_t __elfN(relocation_offset) = 0; static int __elfN(load_elf_header)(char *filename, elf_file_t ef) { ssize_t bytes_read; Elf_Ehdr *ehdr; int err; /* * Open the image, read and validate the ELF header */ if (filename == NULL) /* can't handle nameless */ return (EFTYPE); if ((ef->fd = open(filename, O_RDONLY)) == -1) return (errno); ef->firstpage = malloc(PAGE_SIZE); if (ef->firstpage == NULL) { close(ef->fd); return (ENOMEM); } bytes_read = read(ef->fd, ef->firstpage, PAGE_SIZE); ef->firstlen = (size_t)bytes_read; if (bytes_read < 0 || ef->firstlen <= sizeof(Elf_Ehdr)) { err = EFTYPE; /* could be EIO, but may be small file */ goto error; } ehdr = ef->ehdr = (Elf_Ehdr *)ef->firstpage; /* Is it ELF? */ if (!IS_ELF(*ehdr)) { err = EFTYPE; goto error; } if (ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || /* Layout ? */ ehdr->e_ident[EI_DATA] != ELF_TARG_DATA || ehdr->e_ident[EI_VERSION] != EV_CURRENT || /* Version ? */ ehdr->e_version != EV_CURRENT || ehdr->e_machine != ELF_TARG_MACH) { /* Machine ? */ err = EFTYPE; goto error; } return (0); error: if (ef->firstpage != NULL) { free(ef->firstpage); ef->firstpage = NULL; } if (ef->fd != -1) { close(ef->fd); ef->fd = -1; } return (err); } /* * Attempt to load the file (file) as an ELF module. It will be stored at * (dest), and a pointer to a module structure describing the loaded object * will be saved in (result). */ int __elfN(loadfile)(char *filename, u_int64_t dest, struct preloaded_file **result) { return (__elfN(loadfile_raw)(filename, dest, result, 0)); } int __elfN(loadfile_raw)(char *filename, u_int64_t dest, struct preloaded_file **result, int multiboot) { struct preloaded_file *fp, *kfp; struct elf_file ef; Elf_Ehdr *ehdr; int err; fp = NULL; bzero(&ef, sizeof(struct elf_file)); ef.fd = -1; err = __elfN(load_elf_header)(filename, &ef); if (err != 0) return (err); ehdr = ef.ehdr; /* * Check to see what sort of module we are. */ kfp = file_findfile(NULL, __elfN(kerneltype)); #ifdef __powerpc__ /* * Kernels can be ET_DYN, so just assume the first loaded object is the * kernel. This assumption will be checked later. */ if (kfp == NULL) ef.kernel = 1; #endif if (ef.kernel || ehdr->e_type == ET_EXEC) { /* Looks like a kernel */ if (kfp != NULL) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: kernel already loaded\n"); err = EPERM; goto oerr; } /* * Calculate destination address based on kernel entrypoint. * * For ARM, the destination address is independent of any values in the * elf header (an ARM kernel can be loaded at any 2MB boundary), so we * leave dest set to the value calculated by archsw.arch_loadaddr() and * passed in to this function. */ #ifndef __arm__ if (ehdr->e_type == ET_EXEC) dest = (ehdr->e_entry & ~PAGE_MASK); #endif if ((ehdr->e_entry & ~PAGE_MASK) == 0) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: not a kernel (maybe static binary?)\n"); err = EPERM; goto oerr; } ef.kernel = 1; } else if (ehdr->e_type == ET_DYN) { /* Looks like a kld module */ if (multiboot != 0) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: can't load module as multiboot\n"); err = EPERM; goto oerr; } if (kfp == NULL) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: can't load module before kernel\n"); err = EPERM; goto oerr; } if (strcmp(__elfN(kerneltype), kfp->f_type)) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: can't load module with kernel type '%s'\n", kfp->f_type); err = EPERM; goto oerr; } /* Looks OK, got ahead */ ef.kernel = 0; } else { err = EFTYPE; goto oerr; } if (archsw.arch_loadaddr != NULL) dest = archsw.arch_loadaddr(LOAD_ELF, ehdr, dest); else dest = roundup(dest, PAGE_SIZE); /* * Ok, we think we should handle this. */ fp = file_alloc(); if (fp == NULL) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: cannot allocate module info\n"); err = EPERM; goto out; } if (ef.kernel == 1 && multiboot == 0) setenv("kernelname", filename, 1); fp->f_name = strdup(filename); if (multiboot == 0) fp->f_type = strdup(ef.kernel ? __elfN(kerneltype) : __elfN(moduletype)); else fp->f_type = strdup("elf multiboot kernel"); #ifdef ELF_VERBOSE if (ef.kernel) printf("%s entry at 0x%jx\n", filename, (uintmax_t)ehdr->e_entry); #else printf("%s ", filename); #endif fp->f_size = __elfN(loadimage)(fp, &ef, dest); if (fp->f_size == 0 || fp->f_addr == 0) goto ioerr; /* save exec header as metadata */ file_addmetadata(fp, MODINFOMD_ELFHDR, sizeof(*ehdr), ehdr); /* Load OK, return module pointer */ *result = (struct preloaded_file *)fp; err = 0; goto out; ioerr: err = EIO; oerr: file_discard(fp); out: if (ef.firstpage) free(ef.firstpage); if (ef.fd != -1) close(ef.fd); return(err); } /* * With the file (fd) open on the image, and (ehdr) containing * the Elf header, load the image at (off) */ static int __elfN(loadimage)(struct preloaded_file *fp, elf_file_t ef, u_int64_t off) { int i; u_int j; Elf_Ehdr *ehdr; Elf_Phdr *phdr, *php; Elf_Shdr *shdr; char *shstr; int ret; vm_offset_t firstaddr; vm_offset_t lastaddr; size_t chunk; ssize_t result; Elf_Addr ssym, esym; Elf_Dyn *dp; Elf_Addr adp; Elf_Addr ctors; int ndp; int symstrindex; int symtabindex; Elf_Size size; u_int fpcopy; Elf_Sym sym; Elf_Addr p_start, p_end; dp = NULL; shdr = NULL; ret = 0; firstaddr = lastaddr = 0; ehdr = ef->ehdr; if (ehdr->e_type == ET_EXEC) { #if defined(__i386__) || defined(__amd64__) #if __ELF_WORD_SIZE == 64 off = - (off & 0xffffffffff000000ull);/* x86_64 relocates after locore */ #else off = - (off & 0xff000000u); /* i386 relocates after locore */ #endif #elif defined(__powerpc__) /* * On the purely virtual memory machines like e500, the kernel is * linked against its final VA range, which is most often not * available at the loader stage, but only after kernel initializes * and completes its VM settings. In such cases we cannot use p_vaddr * field directly to load ELF segments, but put them at some * 'load-time' locations. */ if (off & 0xf0000000u) { off = -(off & 0xf0000000u); /* * XXX the physical load address should not be hardcoded. Note * that the Book-E kernel assumes that it's loaded at a 16MB * boundary for now... */ off += 0x01000000; ehdr->e_entry += off; #ifdef ELF_VERBOSE printf("Converted entry 0x%08x\n", ehdr->e_entry); #endif } else off = 0; #elif defined(__arm__) && !defined(EFI) /* * The elf headers in arm kernels specify virtual addresses in all * header fields, even the ones that should be physical addresses. * We assume the entry point is in the first page, and masking the page * offset will leave us with the virtual address the kernel was linked * at. We subtract that from the load offset, making 'off' into the * value which, when added to a virtual address in an elf header, * translates it to a physical address. We do the va->pa conversion on * the entry point address in the header now, so that later we can * launch the kernel by just jumping to that address. * * When booting from UEFI the copyin and copyout functions handle * adjusting the location relative to the first virtual address. * Because of this there is no need to adjust the offset or entry * point address as these will both be handled by the efi code. */ off -= ehdr->e_entry & ~PAGE_MASK; ehdr->e_entry += off; #ifdef ELF_VERBOSE printf("ehdr->e_entry 0x%08x, va<->pa off %llx\n", ehdr->e_entry, off); #endif #else off = 0; /* other archs use direct mapped kernels */ #endif } ef->off = off; if (ef->kernel) __elfN(relocation_offset) = off; if ((ehdr->e_phoff + ehdr->e_phnum * sizeof(*phdr)) > ef->firstlen) { printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: program header not within first page\n"); goto out; } phdr = (Elf_Phdr *)(ef->firstpage + ehdr->e_phoff); for (i = 0; i < ehdr->e_phnum; i++) { /* We want to load PT_LOAD segments only.. */ if (phdr[i].p_type != PT_LOAD) continue; #ifdef ELF_VERBOSE printf("Segment: 0x%lx@0x%lx -> 0x%lx-0x%lx", (long)phdr[i].p_filesz, (long)phdr[i].p_offset, (long)(phdr[i].p_vaddr + off), (long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1)); #else if ((phdr[i].p_flags & PF_W) == 0) { printf("text=0x%lx ", (long)phdr[i].p_filesz); } else { printf("data=0x%lx", (long)phdr[i].p_filesz); if (phdr[i].p_filesz < phdr[i].p_memsz) printf("+0x%lx", (long)(phdr[i].p_memsz -phdr[i].p_filesz)); printf(" "); } #endif fpcopy = 0; if (ef->firstlen > phdr[i].p_offset) { fpcopy = ef->firstlen - phdr[i].p_offset; archsw.arch_copyin(ef->firstpage + phdr[i].p_offset, phdr[i].p_vaddr + off, fpcopy); } if (phdr[i].p_filesz > fpcopy) { if (kern_pread(ef->fd, phdr[i].p_vaddr + off + fpcopy, phdr[i].p_filesz - fpcopy, phdr[i].p_offset + fpcopy) != 0) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: read failed\n"); goto out; } } /* clear space from oversized segments; eg: bss */ if (phdr[i].p_filesz < phdr[i].p_memsz) { #ifdef ELF_VERBOSE printf(" (bss: 0x%lx-0x%lx)", (long)(phdr[i].p_vaddr + off + phdr[i].p_filesz), (long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1)); #endif kern_bzero(phdr[i].p_vaddr + off + phdr[i].p_filesz, phdr[i].p_memsz - phdr[i].p_filesz); } #ifdef ELF_VERBOSE printf("\n"); #endif if (archsw.arch_loadseg != NULL) archsw.arch_loadseg(ehdr, phdr + i, off); if (firstaddr == 0 || firstaddr > (phdr[i].p_vaddr + off)) firstaddr = phdr[i].p_vaddr + off; if (lastaddr == 0 || lastaddr < (phdr[i].p_vaddr + off + phdr[i].p_memsz)) lastaddr = phdr[i].p_vaddr + off + phdr[i].p_memsz; } lastaddr = roundup(lastaddr, sizeof(long)); /* * Get the section headers. We need this for finding the .ctors * section as well as for loading any symbols. Both may be hard * to do if reading from a .gz file as it involves seeking. I * think the rule is going to have to be that you must strip a * file to remove symbols before gzipping it. */ - chunk = ehdr->e_shnum * ehdr->e_shentsize; + chunk = (size_t)ehdr->e_shnum * (size_t)ehdr->e_shentsize; if (chunk == 0 || ehdr->e_shoff == 0) goto nosyms; shdr = alloc_pread(ef->fd, ehdr->e_shoff, chunk); if (shdr == NULL) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: failed to read section headers"); goto nosyms; } file_addmetadata(fp, MODINFOMD_SHDR, chunk, shdr); /* * Read the section string table and look for the .ctors section. * We need to tell the kernel where it is so that it can call the * ctors. */ chunk = shdr[ehdr->e_shstrndx].sh_size; if (chunk) { shstr = alloc_pread(ef->fd, shdr[ehdr->e_shstrndx].sh_offset, chunk); if (shstr) { for (i = 0; i < ehdr->e_shnum; i++) { if (strcmp(shstr + shdr[i].sh_name, ".ctors") != 0) continue; ctors = shdr[i].sh_addr; file_addmetadata(fp, MODINFOMD_CTORS_ADDR, sizeof(ctors), &ctors); size = shdr[i].sh_size; file_addmetadata(fp, MODINFOMD_CTORS_SIZE, sizeof(size), &size); break; } free(shstr); } } /* * Now load any symbols. */ symtabindex = -1; symstrindex = -1; for (i = 0; i < ehdr->e_shnum; i++) { if (shdr[i].sh_type != SHT_SYMTAB) continue; for (j = 0; j < ehdr->e_phnum; j++) { if (phdr[j].p_type != PT_LOAD) continue; if (shdr[i].sh_offset >= phdr[j].p_offset && (shdr[i].sh_offset + shdr[i].sh_size <= phdr[j].p_offset + phdr[j].p_filesz)) { shdr[i].sh_offset = 0; shdr[i].sh_size = 0; break; } } if (shdr[i].sh_offset == 0 || shdr[i].sh_size == 0) continue; /* alread loaded in a PT_LOAD above */ /* Save it for loading below */ symtabindex = i; symstrindex = shdr[i].sh_link; } if (symtabindex < 0 || symstrindex < 0) goto nosyms; /* Ok, committed to a load. */ #ifndef ELF_VERBOSE printf("syms=["); #endif ssym = lastaddr; for (i = symtabindex; i >= 0; i = symstrindex) { #ifdef ELF_VERBOSE char *secname; switch(shdr[i].sh_type) { case SHT_SYMTAB: /* Symbol table */ secname = "symtab"; break; case SHT_STRTAB: /* String table */ secname = "strtab"; break; default: secname = "WHOA!!"; break; } #endif size = shdr[i].sh_size; archsw.arch_copyin(&size, lastaddr, sizeof(size)); lastaddr += sizeof(size); #ifdef ELF_VERBOSE printf("\n%s: 0x%jx@0x%jx -> 0x%jx-0x%jx", secname, (uintmax_t)shdr[i].sh_size, (uintmax_t)shdr[i].sh_offset, (uintmax_t)lastaddr, (uintmax_t)(lastaddr + shdr[i].sh_size)); #else if (i == symstrindex) printf("+"); printf("0x%lx+0x%lx", (long)sizeof(size), (long)size); #endif if (lseek(ef->fd, (off_t)shdr[i].sh_offset, SEEK_SET) == -1) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: could not seek for symbols - skipped!"); lastaddr = ssym; ssym = 0; goto nosyms; } result = archsw.arch_readin(ef->fd, lastaddr, shdr[i].sh_size); if (result < 0 || (size_t)result != shdr[i].sh_size) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: could not read symbols - skipped! (%ju != %ju)", (uintmax_t)result, (uintmax_t)shdr[i].sh_size); lastaddr = ssym; ssym = 0; goto nosyms; } /* Reset offsets relative to ssym */ lastaddr += shdr[i].sh_size; lastaddr = roundup(lastaddr, sizeof(size)); if (i == symtabindex) symtabindex = -1; else if (i == symstrindex) symstrindex = -1; } esym = lastaddr; #ifndef ELF_VERBOSE printf("]"); #endif file_addmetadata(fp, MODINFOMD_SSYM, sizeof(ssym), &ssym); file_addmetadata(fp, MODINFOMD_ESYM, sizeof(esym), &esym); nosyms: printf("\n"); ret = lastaddr - firstaddr; fp->f_addr = firstaddr; php = NULL; for (i = 0; i < ehdr->e_phnum; i++) { if (phdr[i].p_type == PT_DYNAMIC) { php = phdr + i; adp = php->p_vaddr; file_addmetadata(fp, MODINFOMD_DYNAMIC, sizeof(adp), &adp); break; } } if (php == NULL) /* this is bad, we cannot get to symbols or _DYNAMIC */ goto out; ndp = php->p_filesz / sizeof(Elf_Dyn); if (ndp == 0) goto out; dp = malloc(php->p_filesz); if (dp == NULL) goto out; archsw.arch_copyout(php->p_vaddr + off, dp, php->p_filesz); ef->strsz = 0; for (i = 0; i < ndp; i++) { if (dp[i].d_tag == 0) break; switch (dp[i].d_tag) { case DT_HASH: ef->hashtab = (Elf_Hashelt*)(uintptr_t)(dp[i].d_un.d_ptr + off); break; case DT_STRTAB: ef->strtab = (char *)(uintptr_t)(dp[i].d_un.d_ptr + off); break; case DT_STRSZ: ef->strsz = dp[i].d_un.d_val; break; case DT_SYMTAB: ef->symtab = (Elf_Sym*)(uintptr_t)(dp[i].d_un.d_ptr + off); break; case DT_REL: ef->rel = (Elf_Rel *)(uintptr_t)(dp[i].d_un.d_ptr + off); break; case DT_RELSZ: ef->relsz = dp[i].d_un.d_val; break; case DT_RELA: ef->rela = (Elf_Rela *)(uintptr_t)(dp[i].d_un.d_ptr + off); break; case DT_RELASZ: ef->relasz = dp[i].d_un.d_val; break; default: break; } } if (ef->hashtab == NULL || ef->symtab == NULL || ef->strtab == NULL || ef->strsz == 0) goto out; COPYOUT(ef->hashtab, &ef->nbuckets, sizeof(ef->nbuckets)); COPYOUT(ef->hashtab + 1, &ef->nchains, sizeof(ef->nchains)); ef->buckets = ef->hashtab + 2; ef->chains = ef->buckets + ef->nbuckets; if (__elfN(lookup_symbol)(fp, ef, "__start_set_modmetadata_set", &sym) != 0) return 0; p_start = sym.st_value + ef->off; if (__elfN(lookup_symbol)(fp, ef, "__stop_set_modmetadata_set", &sym) != 0) return ENOENT; p_end = sym.st_value + ef->off; if (__elfN(parse_modmetadata)(fp, ef, p_start, p_end) == 0) goto out; if (ef->kernel) /* kernel must not depend on anything */ goto out; out: if (dp) free(dp); if (shdr) free(shdr); return ret; } static char invalid_name[] = "bad"; char * fake_modname(const char *name) { const char *sp, *ep; char *fp; size_t len; sp = strrchr(name, '/'); if (sp) sp++; else sp = name; ep = strrchr(name, '.'); if (ep) { if (ep == name) { sp = invalid_name; ep = invalid_name + sizeof(invalid_name) - 1; } } else ep = name + strlen(name); len = ep - sp; fp = malloc(len + 1); if (fp == NULL) return NULL; memcpy(fp, sp, len); fp[len] = '\0'; return fp; } #if (defined(__i386__) || defined(__powerpc__)) && __ELF_WORD_SIZE == 64 struct mod_metadata64 { int md_version; /* structure version MDTV_* */ int md_type; /* type of entry MDT_* */ u_int64_t md_data; /* specific data */ u_int64_t md_cval; /* common string label */ }; #endif #if defined(__amd64__) && __ELF_WORD_SIZE == 32 struct mod_metadata32 { int md_version; /* structure version MDTV_* */ int md_type; /* type of entry MDT_* */ u_int32_t md_data; /* specific data */ u_int32_t md_cval; /* common string label */ }; #endif int __elfN(load_modmetadata)(struct preloaded_file *fp, u_int64_t dest) { struct elf_file ef; int err, i, j; Elf_Shdr *sh_meta, *shdr = NULL; Elf_Shdr *sh_data[2]; char *shstrtab = NULL; size_t size; Elf_Addr p_start, p_end; bzero(&ef, sizeof(struct elf_file)); ef.fd = -1; err = __elfN(load_elf_header)(fp->f_name, &ef); if (err != 0) goto out; if (ef.kernel == 1 || ef.ehdr->e_type == ET_EXEC) { ef.kernel = 1; } else if (ef.ehdr->e_type != ET_DYN) { err = EFTYPE; goto out; } - size = ef.ehdr->e_shnum * ef.ehdr->e_shentsize; + size = (size_t)ef.ehdr->e_shnum * (size_t)ef.ehdr->e_shentsize; shdr = alloc_pread(ef.fd, ef.ehdr->e_shoff, size); if (shdr == NULL) { err = ENOMEM; goto out; } /* Load shstrtab. */ shstrtab = alloc_pread(ef.fd, shdr[ef.ehdr->e_shstrndx].sh_offset, shdr[ef.ehdr->e_shstrndx].sh_size); if (shstrtab == NULL) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "load_modmetadata: unable to load shstrtab\n"); err = EFTYPE; goto out; } /* Find set_modmetadata_set and data sections. */ sh_data[0] = sh_data[1] = sh_meta = NULL; for (i = 0, j = 0; i < ef.ehdr->e_shnum; i++) { if (strcmp(&shstrtab[shdr[i].sh_name], "set_modmetadata_set") == 0) { sh_meta = &shdr[i]; } if ((strcmp(&shstrtab[shdr[i].sh_name], ".data") == 0) || (strcmp(&shstrtab[shdr[i].sh_name], ".rodata") == 0)) { sh_data[j++] = &shdr[i]; } } if (sh_meta == NULL || sh_data[0] == NULL || sh_data[1] == NULL) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "load_modmetadata: unable to find set_modmetadata_set or data sections\n"); err = EFTYPE; goto out; } /* Load set_modmetadata_set into memory */ err = kern_pread(ef.fd, dest, sh_meta->sh_size, sh_meta->sh_offset); if (err != 0) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "load_modmetadata: unable to load set_modmetadata_set: %d\n", err); goto out; } p_start = dest; p_end = dest + sh_meta->sh_size; dest += sh_meta->sh_size; /* Load data sections into memory. */ err = kern_pread(ef.fd, dest, sh_data[0]->sh_size, sh_data[0]->sh_offset); if (err != 0) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "load_modmetadata: unable to load data: %d\n", err); goto out; } /* * We have to increment the dest, so that the offset is the same into * both the .rodata and .data sections. */ ef.off = -(sh_data[0]->sh_addr - dest); dest += (sh_data[1]->sh_addr - sh_data[0]->sh_addr); err = kern_pread(ef.fd, dest, sh_data[1]->sh_size, sh_data[1]->sh_offset); if (err != 0) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "load_modmetadata: unable to load data: %d\n", err); goto out; } err = __elfN(parse_modmetadata)(fp, &ef, p_start, p_end); if (err != 0) { printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "load_modmetadata: unable to parse metadata: %d\n", err); goto out; } out: if (shstrtab != NULL) free(shstrtab); if (shdr != NULL) free(shdr); if (ef.firstpage != NULL) free(ef.firstpage); if (ef.fd != -1) close(ef.fd); return (err); } int __elfN(parse_modmetadata)(struct preloaded_file *fp, elf_file_t ef, Elf_Addr p_start, Elf_Addr p_end) { struct mod_metadata md; #if (defined(__i386__) || defined(__powerpc__)) && __ELF_WORD_SIZE == 64 struct mod_metadata64 md64; #elif defined(__amd64__) && __ELF_WORD_SIZE == 32 struct mod_metadata32 md32; #endif struct mod_depend *mdepend; struct mod_version mver; char *s; int error, modcnt, minfolen; Elf_Addr v, p; modcnt = 0; p = p_start; while (p < p_end) { COPYOUT(p, &v, sizeof(v)); error = __elfN(reloc_ptr)(fp, ef, p, &v, sizeof(v)); if (error == EOPNOTSUPP) v += ef->off; else if (error != 0) return (error); #if (defined(__i386__) || defined(__powerpc__)) && __ELF_WORD_SIZE == 64 COPYOUT(v, &md64, sizeof(md64)); error = __elfN(reloc_ptr)(fp, ef, v, &md64, sizeof(md64)); if (error == EOPNOTSUPP) { md64.md_cval += ef->off; md64.md_data += ef->off; } else if (error != 0) return (error); md.md_version = md64.md_version; md.md_type = md64.md_type; md.md_cval = (const char *)(uintptr_t)md64.md_cval; md.md_data = (void *)(uintptr_t)md64.md_data; #elif defined(__amd64__) && __ELF_WORD_SIZE == 32 COPYOUT(v, &md32, sizeof(md32)); error = __elfN(reloc_ptr)(fp, ef, v, &md32, sizeof(md32)); if (error == EOPNOTSUPP) { md32.md_cval += ef->off; md32.md_data += ef->off; } else if (error != 0) return (error); md.md_version = md32.md_version; md.md_type = md32.md_type; md.md_cval = (const char *)(uintptr_t)md32.md_cval; md.md_data = (void *)(uintptr_t)md32.md_data; #else COPYOUT(v, &md, sizeof(md)); error = __elfN(reloc_ptr)(fp, ef, v, &md, sizeof(md)); if (error == EOPNOTSUPP) { md.md_cval += ef->off; md.md_data = (void *)((uintptr_t)md.md_data + (uintptr_t)ef->off); } else if (error != 0) return (error); #endif p += sizeof(Elf_Addr); switch(md.md_type) { case MDT_DEPEND: if (ef->kernel) /* kernel must not depend on anything */ break; s = strdupout((vm_offset_t)md.md_cval); minfolen = sizeof(*mdepend) + strlen(s) + 1; mdepend = malloc(minfolen); if (mdepend == NULL) return ENOMEM; COPYOUT((vm_offset_t)md.md_data, mdepend, sizeof(*mdepend)); strcpy((char*)(mdepend + 1), s); free(s); file_addmetadata(fp, MODINFOMD_DEPLIST, minfolen, mdepend); free(mdepend); break; case MDT_VERSION: s = strdupout((vm_offset_t)md.md_cval); COPYOUT((vm_offset_t)md.md_data, &mver, sizeof(mver)); file_addmodule(fp, s, mver.mv_version, NULL); free(s); modcnt++; break; } } if (modcnt == 0) { s = fake_modname(fp->f_name); file_addmodule(fp, s, 1, NULL); free(s); } return 0; } static unsigned long elf_hash(const char *name) { const unsigned char *p = (const unsigned char *) name; unsigned long h = 0; unsigned long g; while (*p != '\0') { h = (h << 4) + *p++; if ((g = h & 0xf0000000) != 0) h ^= g >> 24; h &= ~g; } return h; } static const char __elfN(bad_symtable)[] = "elf" __XSTRING(__ELF_WORD_SIZE) "_lookup_symbol: corrupt symbol table\n"; int __elfN(lookup_symbol)(struct preloaded_file *fp, elf_file_t ef, const char* name, Elf_Sym *symp) { Elf_Hashelt symnum; Elf_Sym sym; char *strp; unsigned long hash; hash = elf_hash(name); COPYOUT(&ef->buckets[hash % ef->nbuckets], &symnum, sizeof(symnum)); while (symnum != STN_UNDEF) { if (symnum >= ef->nchains) { printf(__elfN(bad_symtable)); return ENOENT; } COPYOUT(ef->symtab + symnum, &sym, sizeof(sym)); if (sym.st_name == 0) { printf(__elfN(bad_symtable)); return ENOENT; } strp = strdupout((vm_offset_t)(ef->strtab + sym.st_name)); if (strcmp(name, strp) == 0) { free(strp); if (sym.st_shndx != SHN_UNDEF || (sym.st_value != 0 && ELF_ST_TYPE(sym.st_info) == STT_FUNC)) { *symp = sym; return 0; } return ENOENT; } free(strp); COPYOUT(&ef->chains[symnum], &symnum, sizeof(symnum)); } return ENOENT; } /* * Apply any intra-module relocations to the value. p is the load address * of the value and val/len is the value to be modified. This does NOT modify * the image in-place, because this is done by kern_linker later on. * * Returns EOPNOTSUPP if no relocation method is supplied. */ static int __elfN(reloc_ptr)(struct preloaded_file *mp, elf_file_t ef, Elf_Addr p, void *val, size_t len) { size_t n; Elf_Rela a; Elf_Rel r; int error; /* * The kernel is already relocated, but we still want to apply * offset adjustments. */ if (ef->kernel) return (EOPNOTSUPP); for (n = 0; n < ef->relsz / sizeof(r); n++) { COPYOUT(ef->rel + n, &r, sizeof(r)); error = __elfN(reloc)(ef, __elfN(symaddr), &r, ELF_RELOC_REL, ef->off, p, val, len); if (error != 0) return (error); } for (n = 0; n < ef->relasz / sizeof(a); n++) { COPYOUT(ef->rela + n, &a, sizeof(a)); error = __elfN(reloc)(ef, __elfN(symaddr), &a, ELF_RELOC_RELA, ef->off, p, val, len); if (error != 0) return (error); } return (0); } static Elf_Addr __elfN(symaddr)(struct elf_file *ef, Elf_Size symidx) { /* Symbol lookup by index not required here. */ return (0); } Index: projects/bsd_rdma_4_9/stand/userboot/test/test.c =================================================================== --- projects/bsd_rdma_4_9/stand/userboot/test/test.c (revision 326161) +++ projects/bsd_rdma_4_9/stand/userboot/test/test.c (revision 326162) @@ -1,475 +1,475 @@ /*- * Copyright (c) 2011 Google, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *host_base = NULL; struct termios term, oldterm; char *image; size_t image_size; int disk_fd = -1; uint64_t regs[16]; uint64_t pc; void test_exit(void *arg, int v); /* * Console i/o */ void test_putc(void *arg, int ch) { char c = ch; write(1, &c, 1); } int test_getc(void *arg) { char c; if (read(0, &c, 1) == 1) return c; return -1; } int test_poll(void *arg) { int n; if (ioctl(0, FIONREAD, &n) >= 0) return (n > 0); return (0); } /* * Host filesystem i/o */ struct test_file { int tf_isdir; size_t tf_size; struct stat tf_stat; union { int fd; DIR *dir; } tf_u; }; int test_open(void *arg, const char *filename, void **h_return) { struct stat st; struct test_file *tf; char path[PATH_MAX]; if (!host_base) return (ENOENT); strlcpy(path, host_base, PATH_MAX); if (path[strlen(path) - 1] == '/') path[strlen(path) - 1] = 0; strlcat(path, filename, PATH_MAX); tf = malloc(sizeof(struct test_file)); if (stat(path, &tf->tf_stat) < 0) { free(tf); return (errno); } tf->tf_size = st.st_size; if (S_ISDIR(tf->tf_stat.st_mode)) { tf->tf_isdir = 1; tf->tf_u.dir = opendir(path); if (!tf->tf_u.dir) goto out; *h_return = tf; return (0); } if (S_ISREG(tf->tf_stat.st_mode)) { tf->tf_isdir = 0; tf->tf_u.fd = open(path, O_RDONLY); if (tf->tf_u.fd < 0) goto out; *h_return = tf; return (0); } out: free(tf); return (EINVAL); } int test_close(void *arg, void *h) { struct test_file *tf = h; if (tf->tf_isdir) closedir(tf->tf_u.dir); else close(tf->tf_u.fd); free(tf); return (0); } int test_isdir(void *arg, void *h) { struct test_file *tf = h; return (tf->tf_isdir); } int test_read(void *arg, void *h, void *dst, size_t size, size_t *resid_return) { struct test_file *tf = h; ssize_t sz; if (tf->tf_isdir) return (EINVAL); sz = read(tf->tf_u.fd, dst, size); if (sz < 0) return (EINVAL); *resid_return = size - sz; return (0); } int test_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return, size_t *namelen_return, char *name) { struct test_file *tf = h; struct dirent *dp; if (!tf->tf_isdir) return (EINVAL); dp = readdir(tf->tf_u.dir); if (!dp) return (ENOENT); /* * Note: d_namlen is in the range 0..255 and therefore less * than PATH_MAX so we don't need to test before copying. */ *fileno_return = dp->d_fileno; *type_return = dp->d_type; *namelen_return = dp->d_namlen; memcpy(name, dp->d_name, dp->d_namlen); name[dp->d_namlen] = 0; return (0); } int test_seek(void *arg, void *h, uint64_t offset, int whence) { struct test_file *tf = h; if (tf->tf_isdir) return (EINVAL); if (lseek(tf->tf_u.fd, offset, whence) < 0) return (errno); return (0); } int test_stat(void *arg, void *h, int *mode_return, int *uid_return, int *gid_return, uint64_t *size_return) { struct test_file *tf = h; *mode_return = tf->tf_stat.st_mode; *uid_return = tf->tf_stat.st_uid; *gid_return = tf->tf_stat.st_gid; *size_return = tf->tf_stat.st_size; return (0); } /* * Disk image i/o */ int test_diskread(void *arg, int unit, uint64_t offset, void *dst, size_t size, size_t *resid_return) { ssize_t n; if (unit != 0 || disk_fd == -1) return (EIO); n = pread(disk_fd, dst, size, offset); if (n < 0) return (errno); *resid_return = size - n; return (0); } int test_diskioctl(void *arg, int unit, u_long cmd, void *data) { struct stat sb; if (unit != 0 || disk_fd == -1) return (EBADF); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = 512; break; case DIOCGMEDIASIZE: if (fstat(disk_fd, &sb) == 0) *(off_t *)data = sb.st_size; else return (ENOTTY); break; default: return (ENOTTY); } return (0); } /* * Guest virtual machine i/o * * Note: guest addresses are kernel virtual */ int test_copyin(void *arg, const void *from, uint64_t to, size_t size) { to &= 0x7fffffff; if (to > image_size) return (EFAULT); if (to + size > image_size) size = image_size - to; memcpy(&image[to], from, size); return(0); } int test_copyout(void *arg, uint64_t from, void *to, size_t size) { from &= 0x7fffffff; if (from > image_size) return (EFAULT); if (from + size > image_size) size = image_size - from; memcpy(to, &image[from], size); return(0); } void test_setreg(void *arg, int r, uint64_t v) { if (r < 0 || r >= 16) return; regs[r] = v; } void test_setmsr(void *arg, int r, uint64_t v) { } void test_setcr(void *arg, int r, uint64_t v) { } void test_setgdt(void *arg, uint64_t v, size_t sz) { } void test_exec(void *arg, uint64_t pc) { printf("Execute at 0x%"PRIu64"\n", pc); test_exit(arg, 0); } /* * Misc */ void test_delay(void *arg, int usec) { usleep(usec); } void test_exit(void *arg, int v) { tcsetattr(0, TCSAFLUSH, &oldterm); exit(v); } void test_getmem(void *arg, uint64_t *lowmem, uint64_t *highmem) { *lowmem = 128*1024*1024; *highmem = 0; } const char * test_getenv(void *arg, int idx) { static const char *vars[] = { "foo=bar", "bar=barbar", NULL }; return (vars[idx]); } struct loader_callbacks cb = { .putc = test_putc, .getc = test_getc, .poll = test_poll, .open = test_open, .close = test_close, .isdir = test_isdir, .read = test_read, .readdir = test_readdir, .seek = test_seek, .stat = test_stat, .diskread = test_diskread, .diskioctl = test_diskioctl, .copyin = test_copyin, .copyout = test_copyout, .setreg = test_setreg, .setmsr = test_setmsr, .setcr = test_setcr, .setgdt = test_setgdt, .exec = test_exec, .delay = test_delay, .exit = test_exit, .getmem = test_getmem, .getenv = test_getenv, }; void usage() { printf("usage: [-b ] [-d ] [-h \n"); exit(1); } int main(int argc, char** argv) { void *h; - void (*func)(struct loader_callbacks *, void *, int, int); + void (*func)(struct loader_callbacks *, void *, int, int) __dead2; int opt; char *disk_image = NULL; const char *userboot_obj = "/boot/userboot.so"; while ((opt = getopt(argc, argv, "b:d:h:")) != -1) { switch (opt) { case 'b': userboot_obj = optarg; break; case 'd': disk_image = optarg; break; case 'h': host_base = optarg; break; case '?': usage(); } } h = dlopen(userboot_obj, RTLD_LOCAL); if (!h) { printf("%s\n", dlerror()); return (1); } func = dlsym(h, "loader_main"); if (!func) { printf("%s\n", dlerror()); return (1); } image_size = 128*1024*1024; image = malloc(image_size); if (disk_image) { disk_fd = open(disk_image, O_RDONLY); if (disk_fd < 0) err(1, "Can't open disk image '%s'", disk_image); } tcgetattr(0, &term); oldterm = term; term.c_iflag &= ~(ICRNL); term.c_lflag &= ~(ICANON|ECHO); tcsetattr(0, TCSAFLUSH, &term); func(&cb, NULL, USERBOOT_VERSION_3, disk_fd >= 0); } Index: projects/bsd_rdma_4_9/sys/amd64/amd64/machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/amd64/amd64/machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/amd64/amd64/machdep.c (revision 326162) @@ -1,2570 +1,2569 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = i8254_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, #ifdef SMP .mp_bootaddress = mp_bootaddress, .start_all_aps = native_start_all_aps, #endif .msi_init = msi_init, }; /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is * the physical address at which the kernel is loaded. */ extern char kernphys[]; struct msgbuf *msgbufp; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_gdt, r_idt; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_cnt.v_free_count), ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); update_pcb_bases(pcb); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); update_pcb_bases(pcb); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); update_pcb_bases(pcb); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; - td->td_retval[1] = 0; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); struct amd64tss common_tss[MAXCPU]; /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; void setidt(int idx, inthand_t *func, int typ, int dpl, int ist) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016lx\n", rxcr(0)); db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%016lx\n", rdr0()); db_printf("dr1\t0x%016lx\n", rdr1()); db_printf("dr2\t0x%016lx\n", rdr2()); db_printf("dr3\t0x%016lx\n", rdr3()); db_printf("dr6\t0x%016lx\n", rdr6()); db_printf("dr7\t0x%016lx\n", rdr7()); } #endif void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include #include /* * Return a bitmap of the current interrupt requests. This is 8259-specific * and is only suitable for use at probe time. * This is only here to pacify sio. It is NOT FATAL if this doesn't work. * It shouldn't be here. There should probably be an APIC centric * implementation in the apic driver code, if at all. */ intrmask_t isa_irq_pending(void) { u_char irr1; u_char irr2; irr1 = inb(IO_ICU1); irr2 = inb(IO_ICU2); return ((irr2 << 8) | irr1); } #endif u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. * * NB: physmap_idx points to the next free slot. */ insert_idx = physmap_idx; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idx)) break; } } static char bootmethod[16] = ""; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; int page_counter; bzero(physmap, sizeof(physmap)); physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); physmap_idx -= 2; /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] <= 0xA0000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0 || basemem > 640) { if (bootverbose) printf( "Memory map doesn't contain a basemem segment, faking it"); basemem = 640; } /* * Make hole for "AP -> long mode" bootstrap code. The * mp_bootaddress vector is only available when the kernel * is configured to support APs and APs for the system start * in 32bit mode (e.g. SMP bare metal). */ if (init_ops.mp_bootaddress) { if (physmap[1] >= 0x100000000) panic( "Basemem segment is not suitable for AP bootstrap code!"); physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024); } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmap[0] < physmem_start) { if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); } pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ page_counter = 0; if (memtest != 0) printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * Print a "." every GB to show we're making * progress. */ page_counter++; if ((page_counter % PAGES_PER_GB) == 0) printf("."); /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); if (memtest != 0) printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; char *envp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += KERNBASE; init_static_kenv(envp, 0); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); return (kmdp); } static void amd64_kdb_init(void) { kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; struct nmi_pcpu *np; struct xstate_hdr *xhdr; u_int64_t msr; char *env; size_t kstack0_sz; int late_console; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); kmdp = init_ops.parse_preload_data(modulep); identify_cpu(); identify_hypervisor(); /* Init basic tunables, hz etc */ init_param1(); thread0.td_kstack = physfree + KERNBASE; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); dpcpu_init((void *)(physfree + KERNBASE), 0); physfree += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ PCPU_SET(tssp, &common_tss[0]); PCPU_SET(commontssp, &common_tss[0]); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). * Once bootblocks have updated, we can test directly for * efi_systbl != NULL here... */ if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL) vty_set_preferred(VTY_VT); finishidentcpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* doublefault stack space, runs on ist1 */ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t) pc; common_tss[0].tss_ist2 = (long) np; /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); /* * Temporary forge some valid pointer to PCB, for exception * handlers. It is reinitialized properly below after FPU is * set up. Also set up td_critnest to short-cut the page * fault handler. */ cpu_max_ext_state_size = sizeof(struct savefpu); thread0.td_pcb = get_pcb_td(&thread0); thread0.td_critnest = 1; /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); amd64_kdb_init(); } getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ if (late_console) cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?"; #endif if (late_console) amd64_kdb_init(); msgbufinit(msgbufp, msgbufsize); fpuinit(); /* * Set up thread0 pcb after fpuinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb; /* Ensure the stack is aligned to 16 bytes */ common_tss[0].tss_rsp0 &= ~0xFul; PCPU_SET(rsp0, common_tss[0].tss_rsp0); PCPU_SET(curpcb, thread0.td_pcb); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif thread0.td_critnest = 0; /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_rflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_rflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); update_pcb_bases(pcb); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } set_pcb_flags(pcb, PCB_FULL_IRET); if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. */ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } /* * The pcb_flags is only modified by current thread, or by other threads * when current thread is stopped. However, current thread may change it * from the interrupt context in cpu_switch(), or in the trap handler. * When we read-modify-write pcb_flags from C sources, compiler may generate * code that is not atomic regarding the interrupt handler. If a trap or * interrupt happens and any flag is modified from the handler, it can be * clobbered with the cached value later. Therefore, we implement setting * and clearing flags with single-instruction functions, which do not race * with possible modification of the flags from the trap or interrupt context, * because traps and interrupts are executed only on instruction boundary. */ void set_pcb_flags_raw(struct pcb *pcb, const u_int flags) { __asm __volatile("orl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) : "cc", "memory"); } /* * The support for RDFSBASE, WRFSBASE and similar instructions for %gs * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into * pcb if user space modified the bases. We must save on the context * switch or if the return to usermode happens through the doreti. * * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, * which have a consequence that the base MSRs must be saved each time * the PCB_FULL_IRET flag is set. We disable interrupts to sync with * context switches. */ void set_pcb_flags(struct pcb *pcb, const u_int flags) { register_t r; if (curpcb == pcb && (flags & PCB_FULL_IRET) != 0 && (pcb->pcb_flags & PCB_FULL_IRET) == 0 && (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) { r = intr_disable(); if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { if (rfs() == _ufssel) pcb->pcb_fsbase = rdfsbase(); if (rgs() == _ugssel) pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); } set_pcb_flags_raw(pcb, flags); intr_restore(r); } else { set_pcb_flags_raw(pcb, flags); } } void clear_pcb_flags(struct pcb *pcb, const u_int flags) { __asm __volatile("andl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) : "cc", "memory"); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: projects/bsd_rdma_4_9/sys/amd64/ia32/ia32_signal.c =================================================================== --- projects/bsd_rdma_4_9/sys/amd64/ia32/ia32_signal.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/amd64/ia32/ia32_signal.c (revision 326162) @@ -1,971 +1,970 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2003 Peter Wemm * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD4 static void freebsd4_ia32_sendsig(sig_t, ksiginfo_t *, sigset_t *); #endif #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void ia32_get_fpcontext(struct thread *td, struct ia32_mcontext *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; /* * XXX Format of 64bit and 32bit FXSAVE areas differs. FXSAVE * in 32bit mode saves %cs and %ds, while on 64bit it saves * 64bit instruction and data pointers. Ignore the difference * for now, it should be irrelevant for most applications. */ mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_IA32_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int ia32_set_fpcontext(struct thread *td, struct ia32_mcontext *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } /* * Get machine context. */ static int ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); /* Entry into kernel always sets TF_HASSEGS */ mcp->mc_gs = tp->tf_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_rdi; mcp->mc_esi = tp->tf_rsi; mcp->mc_ebp = tp->tf_rbp; mcp->mc_isp = tp->tf_rsp; mcp->mc_eflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_rax; mcp->mc_edx = tp->tf_rdx; } mcp->mc_ebx = tp->tf_rbx; mcp->mc_ecx = tp->tf_rcx; mcp->mc_eip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); mcp->mc_flags = tp->tf_flags; ia32_get_fpcontext(td, mcp, NULL, 0); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ static int ia32_set_mcontext(struct thread *td, struct ia32_mcontext *mcp) { struct trapframe *tp; char *xfpustate; long rflags; int ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); rflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_IA32_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = ia32_set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_gs = mcp->mc_gs; tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_flags = TF_HASSEGS; tp->tf_rdi = mcp->mc_edi; tp->tf_rsi = mcp->mc_esi; tp->tf_rbp = mcp->mc_ebp; tp->tf_rbx = mcp->mc_ebx; tp->tf_rdx = mcp->mc_edx; tp->tf_rcx = mcp->mc_ecx; tp->tf_rax = mcp->mc_eax; /* trapno, err */ tp->tf_rip = mcp->mc_eip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* * The first two fields of a ucontext_t are the signal mask and * the machine context. The next field is uc_link; we want to * avoid destroying the link when copying out contexts. */ #define UC_COPY_SIZE offsetof(struct ia32_ucontext, uc_link) int freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap) { struct ia32_ucontext uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); bzero(&uc.__spare__, sizeof(uc.__spare__)); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } int freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap) { struct ia32_ucontext uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } return (ret == 0 ? EJUSTRETURN : ret); } int freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap) { struct ia32_ucontext uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } } return (ret == 0 ? EJUSTRETURN : ret); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void ia32_osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe3 sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct ia32_sigframe3 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(sf)); td->td_sigstk.ss_flags |= SS_ONSTACK; } else fp = (struct ia32_sigframe3 *)regs->tf_rsp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ah = (uintptr_t)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ah = (uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_rax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_rbx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_rcx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_rdx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_rsi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_rdi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = regs->tf_gs; sf.sf_siginfo.si_sc.sc_isp = regs->tf_rsp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_esp = regs->tf_rsp; sf.sf_siginfo.si_sc.sc_ebp = regs->tf_rbp; sf.sf_siginfo.si_sc.sc_eip = regs->tf_rip; sf.sf_siginfo.si_sc.sc_eflags = regs->tf_rflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)fp; regs->tf_rip = p->p_sysent->sv_psstrings - sz_ia32_osigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe4 sf, *sfp; struct siginfo32 siginfo; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int oonstack; int sig; td = curthread; p = td->td_proc; siginfo_to_siginfo32(&ksi->ksi_info, &siginfo); PROC_LOCK_ASSERT(p, MA_OWNED); sig = siginfo.si_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx; sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx; sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx; sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax; sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno; sf.sf_uc.uc_mcontext.mc_err = regs->tf_err; sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip; sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct ia32_sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(sf)); } else sfp = (struct ia32_sigframe4 *)regs->tf_rsp - 1; PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; /* Fill in POSIX parts */ sf.sf_si = siginfo; sf.sf_si.si_signo = sig; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = siginfo.si_code; sf.sf_addr = (u_int32_t)siginfo.si_addr; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base + sz_ia32_sigcode - sz_freebsd4_ia32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe sf, *sfp; struct siginfo32 siginfo; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int oonstack; int sig; siginfo_to_siginfo32(&ksi->ksi_info, &siginfo); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = siginfo.si_signo; psp = p->p_sigacts; #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_ia32_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { ia32_osendsig(catcher, ksi, mask); return; } #endif mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx; sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx; sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx; sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax; sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno; sf.sf_uc.uc_mcontext.mc_err = regs->tf_err; sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip; sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase; bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; else sp = (char *)regs->tf_rsp; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(sf); /* Align to 16 bytes. */ sfp = (struct ia32_sigframe *)((uintptr_t)sp & ~0xF); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; /* Fill in POSIX parts */ sf.sf_si = siginfo; sf.sf_si.si_signo = sig; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = siginfo.si_code; sf.sf_addr = (u_int32_t)siginfo.si_addr; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, PTRIN(sf.sf_uc.uc_mcontext.mc_xfpustate), xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* XXXKIB leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ #ifdef COMPAT_43 int ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap) { struct ia32_sigcontext3 sc, *scp; struct trapframe *regs; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_eflags; if (!EFL_SECURE(eflags, regs->tf_rflags)) { return (EINVAL); } if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; regs->tf_gs = scp->sc_gs; regs->tf_rax = scp->sc_eax; regs->tf_rbx = scp->sc_ebx; regs->tf_rcx = scp->sc_ecx; regs->tf_rdx = scp->sc_edx; regs->tf_rsi = scp->sc_esi; regs->tf_rdi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_rbp = scp->sc_ebp; regs->tf_rsp = scp->sc_esp; regs->tf_rip = scp->sc_eip; regs->tf_rflags = eflags; if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } #endif #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_freebsd32_sigreturn(td, uap) struct thread *td; struct freebsd4_freebsd32_sigreturn_args /* { const struct freebsd4_freebsd32_ucontext *sigcntxp; } */ *uap; { struct ia32_ucontext4 uc; struct trapframe *regs; struct ia32_ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_rflags)) { uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_rdi = ucp->uc_mcontext.mc_edi; regs->tf_rsi = ucp->uc_mcontext.mc_esi; regs->tf_rbp = ucp->uc_mcontext.mc_ebp; regs->tf_rbx = ucp->uc_mcontext.mc_ebx; regs->tf_rdx = ucp->uc_mcontext.mc_edx; regs->tf_rcx = ucp->uc_mcontext.mc_ecx; regs->tf_rax = ucp->uc_mcontext.mc_eax; regs->tf_trapno = ucp->uc_mcontext.mc_trapno; regs->tf_err = ucp->uc_mcontext.mc_err; regs->tf_rip = ucp->uc_mcontext.mc_eip; regs->tf_cs = cs; regs->tf_rflags = ucp->uc_mcontext.mc_eflags; regs->tf_rsp = ucp->uc_mcontext.mc_esp; regs->tf_ss = ucp->uc_mcontext.mc_ss; regs->tf_ds = ucp->uc_mcontext.mc_ds; regs->tf_es = ucp->uc_mcontext.mc_es; regs->tf_fs = ucp->uc_mcontext.mc_fs; regs->tf_gs = ucp->uc_mcontext.mc_gs; kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int freebsd32_sigreturn(td, uap) struct thread *td; struct freebsd32_sigreturn_args /* { const struct freebsd32_ucontext *sigcntxp; } */ *uap; { struct ia32_ucontext uc; struct trapframe *regs; struct ia32_ucontext *ucp; char *xfpustate; size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_rflags)) { uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((ucp->uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", td->td_proc->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate), xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", td->td_proc->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = ia32_set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", td->td_proc->p_pid, td->td_name, ret); return (ret); } regs->tf_rdi = ucp->uc_mcontext.mc_edi; regs->tf_rsi = ucp->uc_mcontext.mc_esi; regs->tf_rbp = ucp->uc_mcontext.mc_ebp; regs->tf_rbx = ucp->uc_mcontext.mc_ebx; regs->tf_rdx = ucp->uc_mcontext.mc_edx; regs->tf_rcx = ucp->uc_mcontext.mc_ecx; regs->tf_rax = ucp->uc_mcontext.mc_eax; regs->tf_trapno = ucp->uc_mcontext.mc_trapno; regs->tf_err = ucp->uc_mcontext.mc_err; regs->tf_rip = ucp->uc_mcontext.mc_eip; regs->tf_cs = cs; regs->tf_rflags = ucp->uc_mcontext.mc_eflags; regs->tf_rsp = ucp->uc_mcontext.mc_esp; regs->tf_ss = ucp->uc_mcontext.mc_ss; regs->tf_ds = ucp->uc_mcontext.mc_ds; regs->tf_es = ucp->uc_mcontext.mc_es; regs->tf_fs = ucp->uc_mcontext.mc_fs; regs->tf_gs = ucp->uc_mcontext.mc_gs; regs->tf_flags = TF_HASSEGS; kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * Clear registers on exec */ void ia32_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); #ifdef COMPAT_43 setup_lcall_gate(); #endif pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; pcb->pcb_initial_fpucw = __INITIAL_FPUCW_I386__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucode32sel; regs->tf_rbx = imgp->ps_strings; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; fpstate_drop(td); /* Return via doreti so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); - td->td_retval[1] = 0; } Index: projects/bsd_rdma_4_9/sys/amd64/linux32/linux32_sysvec.c =================================================================== --- projects/bsd_rdma_4_9/sys/amd64/linux32/linux32_sysvec.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/amd64/linux32/linux32_sysvec.c (revision 326162) @@ -1,1220 +1,1219 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define AUXARGS_ENTRY_32(pos, id, val) \ do { \ suword32(pos++, id); \ suword32(pos++, val); \ } while (0) #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX32_SYS_linux_rt_sendsig 0 #define LINUX32_SYS_linux_sendsig 0 const char *linux_kplatform; static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux32_locore_o_start; extern char _binary_linux32_locore_o_end; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static register_t *linux_copyout_strings(struct image_params *imgp); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack); static void linux32_fixlimit(struct rlimit *rl, int which); static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); base = (Elf32_Addr *)*stack_base; args = (Elf32_Auxargs *)imgp->auxargs; pos = base + (imgp->args->argc + imgp->args->envc + 2); issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall); AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY_32(pos, AT_BASE, args->base); AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary)); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp)); if (args->execfd != -1) AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY_32(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword32(base, (uint32_t)imgp->args->argc); *stack_base = (register_t *)base; return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn * and libgcc unwind. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; lmask.__mask = frame.sf_extramask[0]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an * alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = imgp->ps_strings; fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); - td->td_retval[1] = 0; } /* * XXX copied from ia32_sysvec.c. */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; u_int32_t *vectp; char *stringp, *destp; u_int32_t *stack_base; struct linux32_ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; /* * Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; destp = (caddr_t)arginfo - SPARE_USRSPACE - roundup(sizeof(canary), sizeof(char *)) - roundup(execpath_len, sizeof(char *)) - roundup(ARG_MAX - imgp->args->stringspace, sizeof(char *)); if (execpath_len != 0) { imgp->execpathp = (uintptr_t)arginfo - execpath_len; copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); imgp->canary = (uintptr_t)arginfo - roundup(execpath_len, sizeof(char *)) - roundup(sizeof(canary), sizeof(char *)); copyout(canary, (void *)imgp->canary, sizeof(canary)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(u_int32_t)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (u_int32_t *)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); /* * vectp also becomes our initial stack base */ stack_base = vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword32(vectp++, 0); suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword32(vectp, 0); return ((register_t *)stack_base); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); #if defined(DEBUG) SYSCTL_PROC(_compat_linux32, OID_AUTO, debug, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, linux_sysctl_debug, "A", "Linux debugging control"); #endif static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX32_SYS_MAXSYSCALL, .sv_table = linux32_sysent, .sv_mask = 0, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux32_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux32_locore_o_end - &_binary_linux32_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; linux_kplatform = linux_shared_page_mapping + (linux_platform - (caddr_t)elf_linux_sysvec.sv_shared_page_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj); }; SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. */ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); FEATURE(linux, "Linux 32bit support"); Index: projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_hw.c =================================================================== --- projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_hw.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_hw.c (revision 326162) @@ -1,1509 +1,1507 @@ /*- * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #include "io/iommu.h" #include "amdvi_priv.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); #define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) #define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) /* Print RID or device ID in PCI string format. */ #define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) static void amdvi_dump_cmds(struct amdvi_softc *softc); static void amdvi_print_dev_cap(struct amdvi_softc *softc); MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); extern device_t *ivhd_devs; extern int ivhd_count; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, 0, NULL); static int amdvi_enable_user = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, &amdvi_enable_user, 0, NULL); TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); #ifdef AMDVI_ATS_ENABLE /* XXX: ATS is not tested. */ static int amdvi_enable_iotlb = 1; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, &amdvi_enable_iotlb, 0, NULL); TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); #endif static int amdvi_host_ptp = 1; /* Use page tables for host. */ SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, &amdvi_host_ptp, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); /* Page table level used <= supported by h/w[v1=7]. */ static int amdvi_ptp_level = 4; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, &amdvi_ptp_level, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); /* Disable fault event reporting. */ static int amdvi_disable_io_fault = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, &amdvi_disable_io_fault, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, &amdvi_dom_id, 0, NULL); /* * Device table entry. * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). * = 256 * 2 * PAGE_SIZE. */ static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); CTASSERT(PCI_NUM_DEV_MAX == 0x10000); CTASSERT(sizeof(amdvi_dte) == 0x200000); static SLIST_HEAD (, amdvi_domain) dom_head; static inline void amdvi_pci_write(struct amdvi_softc *softc, int off, uint32_t data) { pci_cfgregwrite(PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), off, data, 4); } static inline uint32_t amdvi_pci_read(struct amdvi_softc *softc, int off) { return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), off, 4)); } static int amdvi_find_pci_cap(struct amdvi_softc *softc, uint8_t capability, int *off) { uint32_t read; uint8_t ptr; read = amdvi_pci_read(softc, PCIR_COMMAND); if (((read >> 16) & PCIM_STATUS_CAPPRESENT) == 0) return (ENXIO); /* Read the starting of capability pointer. */ read = amdvi_pci_read(softc, PCIR_CAP_PTR); ptr = read & 0xFF; while (ptr != 0) { read = amdvi_pci_read(softc, ptr); if ((read & 0xFF) == capability) { *off = ptr; return (0); } ptr = (read >> 8) & 0xFF; } return (ENOENT); } #ifdef AMDVI_ATS_ENABLE /* XXX: Should be in pci.c */ /* * Check if device has ATS capability and its enabled. * If ATS is absent or disabled, return (-1), otherwise ATS * queue length. */ static int amdvi_find_ats_qlen(uint16_t devid) { device_t dev; uint32_t off, cap; int qlen = -1; dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), PCI_RID2FUNC(devid)); if (!dev) { return (-1); } #define PCIM_ATS_EN BIT(31) if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { cap = pci_read_config(dev, off + 4, 4); qlen = (cap & 0x1F); qlen = qlen ? qlen : 32; printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", RID2PCI_STR(devid), (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", qlen); qlen = (cap & PCIM_ATS_EN) ? qlen : -1; } return (qlen); } /* * Check if an endpoint device support device IOTLB or ATS. */ static inline bool amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct ivhd_dev_cfg *cfg; int qlen, i; bool pci_ats, ivhd_ats; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) return (false); KASSERT(softc, ("softc is NULL")); cfg = softc->dev_cfg; ivhd_ats = false; for (i = 0; i < softc->dev_cfg_cnt; i++) { if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { ivhd_ats = cfg->enable_ats; break; } cfg++; } pci_ats = (qlen < 0) ? false : true; if (pci_ats != ivhd_ats) device_printf(softc->dev, "BIOS bug: mismatch in ATS setting for %d.%d.%d," "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); /* Ignore IVRS setting and respect PCI setting. */ return (pci_ats); } #endif /* Enable IOTLB support for IOMMU if its supported. */ static inline void amdvi_hw_enable_iotlb(struct amdvi_softc *softc) { #ifndef AMDVI_ATS_ENABLE softc->iotlb = false; #else bool supported; supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { if (!supported) device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); if (supported && !amdvi_enable_iotlb) { device_printf(softc->dev, "IOTLB disabled by user.\n"); supported = false; } } else supported = false; softc->iotlb = supported; #endif } static int amdvi_init_cmd(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl = softc->ctrl; ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ softc->cmd_max = 1 << ctrl->cmd.len; softc->cmd = malloc(sizeof(struct amdvi_cmd) * softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->cmd & PAGE_MASK) panic("AMDVi: Command buffer not aligned on page boundary."); ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; /* * XXX: Reset the h/w pointers in case IOMMU is restarting, * h/w doesn't clear these pointers based on empirical data. */ ctrl->cmd_tail = 0; ctrl->cmd_head = 0; return (0); } /* * Note: Update tail pointer after we have written the command since tail * pointer update cause h/w to execute new commands, see section 3.3 * of AMD IOMMU spec ver 2.0. */ /* Get the command tail pointer w/o updating it. */ static struct amdvi_cmd * amdvi_get_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *tail; KASSERT(softc, ("softc is NULL")); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + ctrl->cmd_tail); return (tail); } /* * Update the command tail pointer which will start command execution. */ static void amdvi_update_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int size; size = sizeof(struct amdvi_cmd); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); softc->total_cmd++; #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); #endif } /* * Various commands supported by IOMMU. */ /* Completion wait command. */ static void amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) { struct amdvi_cmd *cmd; uint64_t pa; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); pa = vtophys(&softc->cmp_data); cmd->opcode = AMDVI_CMP_WAIT_OPCODE; cmd->word0 = (pa & 0xFFFFFFF8) | (AMDVI_CMP_WAIT_STORE); //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); cmd->word1 = (pa >> 32) & 0xFFFFF; cmd->addr = data; amdvi_update_cmd_tail(softc); } /* Invalidate device table entry. */ static void amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_DTE_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); #endif } /* Invalidate IOMMU page, use for invalidation of domain. */ static void amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, uint64_t addr, bool guest_nested, bool pde, bool page) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_PAGE_OPCODE; cmd->word1 = domain_id; /* * Invalidate all addresses for this domain. */ cmd->addr = addr; cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; amdvi_update_cmd_tail(softc); } #ifdef AMDVI_ATS_ENABLE /* Invalidate device IOTLB. */ static void amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; int qlen; if (!softc->iotlb) return; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) { panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", qlen, RID2PCI_STR(devid)); } cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" " Qlen:%d\n", devid, qlen); #endif cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; cmd->word0 = devid; cmd->word1 = qlen; cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | AMDVI_INVD_IOTLB_S; amdvi_update_cmd_tail(softc); } #endif #ifdef notyet /* For Interrupt Remap. */ static void amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_INTR_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); #endif } #endif /* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ static void amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); /* * See section 3.3.3 of IOMMU spec rev 2.0, software note * for invalidating domain. */ amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, false, true, true); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); #endif } static bool amdvi_cmp_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; const uint64_t VERIFY = 0xA5A5; volatile uint64_t *read; int i; bool status; ctrl = softc->ctrl; read = &softc->cmp_data; *read = 0; amdvi_cmd_cmp(softc, VERIFY); /* Wait for h/w to update completion data. */ for (i = 0; i < 100 && (*read != VERIFY); i++) { DELAY(1000); /* 1 ms */ } status = (VERIFY == softc->cmp_data) ? true : false; #ifdef AMDVI_DEBUG_CMD if (status) device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, ctrl->cmd_head, loop); #endif return (status); } static void amdvi_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int i; KASSERT(softc, ("softc is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); /* Don't wait if h/w is not enabled. */ if ((ctrl->control & AMDVI_CTRL_EN) == 0) return; for (i = 0; i < 10; i++) { if (amdvi_cmp_wait(softc)) return; } device_printf(softc->dev, "Error: completion failed" " tail:0x%x, head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); amdvi_dump_cmds(softc); } static void amdvi_dump_cmds(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *cmd; int off, i; ctrl = softc->ctrl; device_printf(softc->dev, "Dump all the commands:\n"); /* * If h/w is stuck in completion, it is the previous command, * start dumping from previous command onward. */ off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), softc->cmd_max); for (i = 0; off != ctrl->cmd_tail && i < softc->cmd_max; i++) { cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" " 0x%x 0x%lx\n", i, off, cmd->opcode, cmd->word0, cmd->word1, cmd->addr); off = (off + sizeof(struct amdvi_cmd)) % (softc->cmd_max * sizeof(struct amdvi_cmd)); } } static int amdvi_init_event(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->event.len = 8; softc->event_max = 1 << ctrl->event.len; softc->event = malloc(sizeof(struct amdvi_event) * softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->event & PAGE_MASK) { device_printf(softc->dev, "Event buffer not aligned on page."); return (false); } ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; /* Reset the pointers. */ ctrl->evt_head = 0; ctrl->evt_tail = 0; return (0); } static inline void amdvi_decode_evt_flag(uint16_t flag) { flag &= AMDVI_EVENT_FLAG_MASK; - printf("0x%b]\n", flag, + printf(" 0x%b]\n", flag, "\020" "\001GN" "\002NX" "\003US" "\004I" "\005PR" "\006RW" "\007PE" "\010RZ" "\011TR" ); } /* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ static inline void amdvi_decode_evt_flag_type(uint8_t type) { switch (AMDVI_EVENT_FLAG_TYPE(type)) { case 0: printf("RSVD\n"); break; case 1: printf("Master Abort\n"); break; case 2: printf("Target Abort\n"); break; case 3: printf("Data Err\n"); break; default: break; } } static void amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(flag); } static void amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); } static void amdvi_decode_evt(struct amdvi_event *evt) { struct amdvi_cmd *cmd; switch (evt->opcode) { case AMDVI_EVENT_INVALID_DTE: amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PFAULT: amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_DTE_HW_ERROR: amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PAGE_HW_ERROR: amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_ILLEGAL_CMD: /* FALL THROUGH */ case AMDVI_EVENT_CMD_HW_ERROR: - printf("\t[%s EVT]", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? + printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? "ILLEGAL CMD" : "CMD HW ERR"); cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", cmd->opcode, cmd->word0, cmd->word1, cmd->addr); break; case AMDVI_EVENT_IOTLB_TIMEOUT: - printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx", + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", evt->devid, evt->addr); break; case AMDVI_EVENT_INVALID_DTE_REQ: - printf("\t[INV_DTE devid:0x%x addr:0x%lx", - evt->devid, evt->addr); + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); break; case AMDVI_EVENT_INVALID_PPR_REQ: case AMDVI_EVENT_COUNTER_ZERO: printf("AMD-Vi: v2 events.\n"); break; default: - printf("Unsupported AMD-Vi event:%d", evt->opcode); + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); } } static void amdvi_print_events(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_event *event; int i, size; ctrl = softc->ctrl; size = sizeof(struct amdvi_event); for (i = 0; i < softc->event_max; i++) { event = &softc->event[ctrl->evt_head / size]; if (!event->opcode) break; device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", i, ctrl->evt_head, ctrl->evt_tail); amdvi_decode_evt(event); ctrl->evt_head = MOD_INC(ctrl->evt_head, size, softc->event_max); } } static int amdvi_init_dte(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; ctrl->dte.size = 0x1FF; /* 2MB device table. */ return (0); } /* * Not all capabilities of IOMMU are available in ACPI IVHD flag * or EFR entry, read directly from device. */ static int amdvi_print_pci_cap(device_t dev) { struct amdvi_softc *softc; uint32_t off, cap; softc = device_get_softc(dev); off = softc->cap_off; /* * Section 3.7.1 of IOMMU sepc rev 2.0. * Read capability from device. */ cap = amdvi_pci_read(softc, off); /* Make sure capability type[18:16] is 3. */ KASSERT((((cap >> 16) & 0x7) == 0x3), ("Not a IOMMU capability 0x%x@0x%x", cap, off)); softc->pci_cap = cap >> 24; device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", cap, off, softc->pci_cap, "\020\001IOTLB\002HT\003NPCache\004EFR"); /* IOMMU spec Rev 2.0, section 3.7.2.1 */ softc->pci_efr = softc->ctrl->ex_feature; if (softc->pci_efr) { device_printf(softc->dev, "PCI extended Feature:%b\n", (int)softc->pci_efr, "\020\001PreFSup\002PPRSup\003XTSup\004NXSup\006IASup" "\007GASup\008HESup\009PCSup"); device_printf(softc->dev, "PCI HATS = %d GATS = %d GLXSup = %d, max PASID: 0x%x ", (int)((softc->pci_efr >> 10) & 0x3), (int)((softc->pci_efr >> 12) & 0x3), (int)((softc->pci_efr >> 14) & 0x3), (int)((softc->pci_efr >> 32) & 0x1F) + 1); } return (0); } static void amdvi_event_intr(void *arg) { struct amdvi_softc *softc; struct amdvi_ctrl *ctrl; softc = (struct amdvi_softc *)arg; ctrl = softc->ctrl; device_printf(softc->dev, "EVT INTR %ld Status:0x%x" " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, ctrl->status, ctrl->evt_head, ctrl->evt_tail); printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); amdvi_print_events(softc); } static void amdvi_free_evt_intr_res(device_t dev) { struct amdvi_softc *softc; softc = device_get_softc(dev); if (softc->event_tag != NULL) { bus_teardown_intr(dev, softc->event_res, softc->event_tag); } if (softc->event_res != NULL) { bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, softc->event_res); } bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), dev, 1, &softc->event_irq); } static bool amdvi_alloc_intr_resources(struct amdvi_softc *softc) { device_t dev, pcib; uint64_t msi_addr; uint32_t msi_data, temp; int err, msi_off; dev = softc->dev; pcib = device_get_parent(device_get_parent(dev)); softc->event_irq = -1; softc->event_rid = 0; /* * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one * interrupt. XXX: Enable MSI/X support. */ err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); if (err) { device_printf(dev, "Couldn't find event MSI IRQ resource.\n"); return (ENOENT); } err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, softc->event_irq, 1); if (err) { device_printf(dev, "Couldn't set event MSI resource.\n"); return (ENXIO); } softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &softc->event_rid, RF_ACTIVE); if (!softc->event_res) { device_printf(dev, "Unable to allocate event INTR resource.\n"); return (ENOMEM); } if (bus_setup_intr(dev, softc->event_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, softc, &softc->event_tag)) { device_printf(dev, "Fail to setup event intr\n"); bus_release_resource(softc->dev, SYS_RES_IRQ, softc->event_rid, softc->event_res); softc->event_res = NULL; return (ENXIO); } bus_describe_intr(dev, softc->event_res, softc->event_tag, "fault"); err = amdvi_find_pci_cap(softc, PCIY_MSI, &msi_off); if (err) { device_printf(dev, "Couldn't find MSI capability, err = %d.\n", err); return (err); } err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, &msi_data); if (err) { device_printf(dev, "Event interrupt config failed, err=%d.\n", err); amdvi_free_evt_intr_res(softc->dev); return (err); } /* Configure MSI */ amdvi_pci_write(softc, msi_off + PCIR_MSI_ADDR, msi_addr); amdvi_pci_write(softc, msi_off + PCIR_MSI_ADDR_HIGH, msi_addr >> 32); amdvi_pci_write(softc, msi_off + PCIR_MSI_DATA_64BIT, msi_data); /* Now enable MSI interrupt. */ temp = amdvi_pci_read(softc, msi_off); temp |= (PCIM_MSICTRL_MSI_ENABLE << 16); /* MSI enable. */ amdvi_pci_write(softc, msi_off, temp); return (0); } static void amdvi_print_dev_cap(struct amdvi_softc *softc) { struct ivhd_dev_cfg *cfg; int i; cfg = softc->dev_cfg; for (i = 0; i < softc->dev_cfg_cnt; i++) { device_printf(softc->dev, "device [0x%x - 0x%x]" "config:%b%s\n", cfg->start_id, cfg->end_id, cfg->data, "\020\001INIT\002ExtInt\003NMI" "\007LINT0\008LINT1", cfg->enable_ats ? "ATS enabled" : ""); cfg++; } } static int amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) { struct amdvi_softc *softc; int result, type, error = 0; softc = (struct amdvi_softc *)arg1; type = arg2; switch (type) { case 0: result = softc->ctrl->cmd_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 1: result = softc->ctrl->cmd_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; case 2: result = softc->ctrl->evt_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 3: result = softc->ctrl->evt_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; default: device_printf(softc->dev, "Unknown sysctl:%d\n", type); } return (error); } static void amdvi_add_sysctl(struct amdvi_softc *softc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev; dev = softc->dev; ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, &softc->event_intr_cnt, "Event interrupt count"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, &softc->total_cmd, "Command submitted count"); - SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, - (int *)&softc->pci_rid, 0, - "IOMMU RID"); - SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, - (int *)&softc->start_dev_rid, 0, - "Start of device under this IOMMU"); - SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, - (int *)&softc->end_dev_rid, 0, - "End of device under this IOMMU"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, + &softc->pci_rid, 0, "IOMMU RID"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, + &softc->start_dev_rid, 0, "Start of device under this IOMMU"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, + &softc->end_dev_rid, 0, "End of device under this IOMMU"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", CTLTYPE_UINT | CTLFLAG_RD, softc, 0, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", CTLTYPE_UINT | CTLFLAG_RD, softc, 1, amdvi_handle_sysctl, "IU", "Command tail"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", CTLTYPE_UINT | CTLFLAG_RD, softc, 2, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", CTLTYPE_UINT | CTLFLAG_RD, softc, 3, amdvi_handle_sysctl, "IU", "Command tail"); } int amdvi_setup_hw(struct amdvi_softc *softc) { device_t dev; int status; dev = softc->dev; amdvi_hw_enable_iotlb(softc); amdvi_print_dev_cap(softc); if ((status = amdvi_print_pci_cap(dev)) != 0) { device_printf(dev, "PCI capability.\n"); return (status); } if ((status = amdvi_init_cmd(softc)) != 0) { device_printf(dev, "Couldn't configure command buffer.\n"); return (status); } if ((status = amdvi_init_event(softc)) != 0) { device_printf(dev, "Couldn't configure event buffer.\n"); return (status); } if ((status = amdvi_init_dte(softc)) != 0) { device_printf(dev, "Couldn't configure device table.\n"); return (status); } if ((status = amdvi_alloc_intr_resources(softc)) != 0) { return (status); } amdvi_add_sysctl(softc); return (0); } int amdvi_teardown_hw(struct amdvi_softc *softc) { device_t dev; dev = softc->dev; /* * Called after disable, h/w is stopped by now, free all the resources. */ amdvi_free_evt_intr_res(dev); if (softc->cmd) free(softc->cmd, M_AMDVI); if (softc->event) free(softc->event, M_AMDVI); return (0); } /*********** bhyve interfaces *********************/ static int amdvi_init(void) { if (!ivhd_count) { return (EIO); } if (!amdvi_enable_user && ivhd_count) { printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " "use hw.vmm.amdvi_enable=1 to enable pass-through.\n", ivhd_count); return (EINVAL); } return (0); } static void amdvi_cleanup(void) { /* Nothing. */ } static uint16_t amdvi_domainId(void) { /* * If we hit maximum domain limit, rollover leaving host * domain(0). * XXX: make sure that this domain is not used. */ if (amdvi_dom_id == AMDVI_MAX_DOMAIN) amdvi_dom_id = 1; return ((uint16_t)amdvi_dom_id++); } static void amdvi_do_inv_domain(uint16_t domain_id, bool create) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL")); /* * If not present pages are cached, invalidate page after * creating domain. */ #if 0 if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) continue; #endif amdvi_inv_domain(softc, domain_id); amdvi_wait(softc); } } static void * amdvi_create_domain(vm_paddr_t maxaddr) { struct amdvi_domain *dom; dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); dom->id = amdvi_domainId(); //dom->maxaddr = maxaddr; #ifdef AMDVI_DEBUG_CMD printf("Created domain #%d\n", dom->id); #endif /* * Host domain(#0) don't create translation table. */ if (dom->id || amdvi_host_ptp) dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); dom->ptp_level = amdvi_ptp_level; amdvi_do_inv_domain(dom->id, true); SLIST_INSERT_HEAD(&dom_head, dom, next); return (dom); } static void amdvi_free_ptp(uint64_t *ptp, int level) { int i; if (level < 1) return; for (i = 0; i < NPTEPG ; i++) { if ((ptp[i] & AMDVI_PT_PRESENT) == 0) continue; /* XXX: Add super-page or PTE mapping > 4KB. */ #ifdef notyet /* Super-page mapping. */ if (AMDVI_PD_SUPER(ptp[i])) continue; #endif amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] & AMDVI_PT_MASK), level - 1); } free(ptp, M_AMDVI); } static void amdvi_destroy_domain(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Destroying domain %d\n", domain->id); #endif if (domain->ptp) amdvi_free_ptp(domain->ptp, domain->ptp_level); amdvi_do_inv_domain(domain->id, false); SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); free(domain, M_AMDVI); } static uint64_t amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t pg_size, bool create) { uint64_t *page, pa; int shift, index; const int PT_SHIFT = 9; const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ if (!pg_size) return (0); if (hpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } if (gpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } shift = PML4SHIFT; while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { index = (gpa >> shift) & PT_INDEX_MASK; if ((pt[index] == 0) && create) { page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); pa = vtophys(page); pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | ((level - 1) << AMDVI_PD_LEVEL_SHIFT); } #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif #define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) pa = PTE2PA(pt[index]); pt = (uint64_t *)PHYS_TO_DMAP(pa); shift -= PT_SHIFT; level--; } /* Leaf entry. */ index = (gpa >> shift) & PT_INDEX_MASK; if (create) { pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; } else pt[index] = 0; #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[Last level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif return (1ULL << shift); } static uint64_t amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t size, bool create) { uint64_t mapped, *ptp, len; int level; KASSERT(domain, ("domain is NULL")); level = domain->ptp_level; KASSERT(level, ("Page table level is 0")); ptp = domain->ptp; KASSERT(ptp, ("PTP is NULL")); mapped = 0; while (mapped < size) { len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, PAGE_SIZE, create); if (!len) { printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", hpa, gpa); return (0); } mapped += len; } return (mapped); } static uint64_t amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; if (domain->id && !domain->ptp) { printf("ptp is NULL"); return (-1); } /* * If host domain is created w/o page table, skip IOMMU page * table set-up. */ if (domain->ptp) return (amdvi_update_mapping(domain, gpa, hpa, len, true)); else return (len); } static uint64_t amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; /* * If host domain is created w/o page table, skip IOMMU page * table set-up. */ if (domain->ptp) return (amdvi_update_mapping(domain, gpa, 0, len, false)); return (len); } static struct amdvi_softc * amdvi_find_iommu(uint16_t devid) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); if ((devid >= softc->start_dev_rid) && (devid <= softc->end_dev_rid)) return (softc); } /* * XXX: BIOS bug, device not in IVRS table, assume its from first IOMMU. */ printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n", RID2PCI_STR(devid)); return (device_get_softc(ivhd_devs[0])); } /* * Set-up device table entry. * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must * be set concurrently, e.g. read and write bits. */ static void amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable) { struct amdvi_softc *softc; struct amdvi_dte temp; softc = amdvi_find_iommu(devid); KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); memset(&temp, 0, sizeof(struct amdvi_dte)); #ifdef AMDVI_ATS_ENABLE /* If IOMMU and device support IOTLB, enable it. */ if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) temp.iotlb_enable = 1; #endif /* Avoid duplicate I/O faults. */ temp.sup_second_io_fault = 1; temp.sup_all_io_fault = amdvi_disable_io_fault; temp.dt_valid = 1; temp.domain_id = domain->id; if (enable) { if (domain->ptp) { temp.pt_base = vtophys(domain->ptp) >> 12; temp.pt_level = amdvi_ptp_level; } /* * XXX: Page table valid[TV] bit must be set even if host domain * page tables are not enabled. */ temp.pt_valid = 1; temp.read_allow = 1; temp.write_allow = 1; } amdvi_dte[devid] = temp; } static void amdvi_inv_device(uint16_t devid) { struct amdvi_softc *softc; softc = amdvi_find_iommu(devid); KASSERT(softc, ("softc is NULL")); amdvi_cmd_inv_dte(softc, devid); #ifdef AMDVI_ATS_ENABLE if (amdvi_dev_support_iotlb(softc, devid)) amdvi_cmd_inv_iotlb(softc, devid); #endif amdvi_wait(softc); } static void amdvi_add_device(void *arg, uint16_t devid) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain != NULL, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Assigning device(%d.%d.%d) to domain:%d\n", RID2PCI_STR(devid), domain->id); #endif amdvi_set_dte(domain, devid, true); amdvi_inv_device(devid); } static void amdvi_remove_device(void *arg, uint16_t devid) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; #ifdef AMDVI_DEBUG_CMD printf("Remove device(0x%x) from domain:%d\n", devid, domain->id); #endif amdvi_set_dte(domain, devid, false); amdvi_inv_device(devid); } static void amdvi_enable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; uint64_t val; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); val = ( AMDVI_CTRL_EN | AMDVI_CTRL_CMD | AMDVI_CTRL_ELOG | AMDVI_CTRL_ELOGINT | AMDVI_CTRL_INV_TO_1S); if (softc->ivhd_flag & IVHD_FLAG_COH) val |= AMDVI_CTRL_COH; if (softc->ivhd_flag & IVHD_FLAG_HTT) val |= AMDVI_CTRL_HTT; if (softc->ivhd_flag & IVHD_FLAG_RPPW) val |= AMDVI_CTRL_RPPW; if (softc->ivhd_flag & IVHD_FLAG_PPW) val |= AMDVI_CTRL_PPW; if (softc->ivhd_flag & IVHD_FLAG_ISOC) val |= AMDVI_CTRL_ISOC; ctrl->control = val; } } static void amdvi_disable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); ctrl->control = 0; } } static void amdvi_inv_tlb(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); amdvi_do_inv_domain(domain->id, false); } struct iommu_ops iommu_ops_amd = { amdvi_init, amdvi_cleanup, amdvi_enable, amdvi_disable, amdvi_create_domain, amdvi_destroy_domain, amdvi_create_mapping, amdvi_destroy_mapping, amdvi_add_device, amdvi_remove_device, amdvi_inv_tlb }; Index: projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_priv.h =================================================================== --- projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_priv.h (revision 326161) +++ projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_priv.h (revision 326162) @@ -1,395 +1,395 @@ /*- * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _AMDVI_PRIV_H_ #define _AMDVI_PRIV_H_ #define BIT(n) (1ULL << (n)) /* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ #define REG_BITS(x, n, m) (((x) >> (m)) & \ ((1 << (((n) - (m)) + 1)) - 1)) /* * IOMMU PCI capability. */ #define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ #define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ #define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ #define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ #define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ /* * IOMMU extended features. */ #define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ #define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ #define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ #define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ #define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ #define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ #define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. */ #define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ #define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ #define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ /* XXX: add more EFER bits. */ /* * Device table entry or DTE * NOTE: Must be 256-bits/32 bytes aligned. */ struct amdvi_dte { uint32_t dt_valid:1; /* Device Table valid. */ uint32_t pt_valid:1; /* Page translation valid. */ uint16_t :7; /* Reserved[8:2] */ uint8_t pt_level:3; /* Paging level, 0 to disable. */ uint64_t pt_base:40; /* Page table root pointer. */ uint8_t :3; /* Reserved[54:52] */ uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ uint8_t gv_level:2; /* Revision 2, GLX level. */ uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ uint8_t read_allow:1; /* I/O read enabled. */ uint8_t write_allow:1; /* I/O write enabled. */ uint8_t :1; /* Reserved[63] */ uint16_t domain_id:16; /* Domain ID */ uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ uint8_t iotlb_enable:1; /* Device support IOTLB */ uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ uint8_t IOctl:2; /* Port I/O control. */ uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ uint8_t snoop_disable:1; /* Snoop disable. */ uint8_t allow_ex:1; /* Allow exclusion. */ uint8_t sysmgmt:2; /* System management message.*/ uint8_t :1; /* Reserved[106] */ uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ uint8_t intmap_valid:1; /* Interrupt map valid. */ uint8_t intmap_len:4; /* Interrupt map table length. */ uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ uint64_t intmap_base:46; /* IntMap base. */ uint8_t :4; /* Reserved[183:180] */ uint8_t init_pass:1; /* INIT pass through or PT */ uint8_t extintr_pass:1; /* External Interrupt PT */ uint8_t nmi_pass:1; /* NMI PT */ uint8_t :1; /* Reserved[187] */ uint8_t intr_ctrl:2; /* Interrupt control */ uint8_t lint0_pass:1; /* LINT0 PT */ uint8_t lint1_pass:1; /* LINT1 PT */ uint64_t :64; /* Reserved[255:192] */ } __attribute__((__packed__)); CTASSERT(sizeof(struct amdvi_dte) == 32); /* * IOMMU command entry. */ struct amdvi_cmd { uint32_t word0; uint32_t word1:28; uint8_t opcode:4; uint64_t addr; } __attribute__((__packed__)); /* Command opcodes. */ #define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ #define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ #define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ #define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ #define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ #define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ #define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ #define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ /* Completion wait attributes. */ #define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ #define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ #define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ /* Invalidate page. */ #define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ #define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ #define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ #define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) /* Invalidate IOTLB. */ #define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ #define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. */ #define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) /* XXX: add more command entries. */ /* * IOMMU event entry. */ struct amdvi_event { uint16_t devid; uint16_t pasid_hi; uint16_t pasid_domid; /* PASID low or DomainID */ uint16_t flag:12; uint8_t opcode:4; uint64_t addr; } __attribute__((__packed__)); CTASSERT(sizeof(struct amdvi_event) == 16); /* Various event types. */ #define AMDVI_EVENT_INVALID_DTE 0x1 #define AMDVI_EVENT_PFAULT 0x2 #define AMDVI_EVENT_DTE_HW_ERROR 0x3 #define AMDVI_EVENT_PAGE_HW_ERROR 0x4 #define AMDVI_EVENT_ILLEGAL_CMD 0x5 #define AMDVI_EVENT_CMD_HW_ERROR 0x6 #define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 #define AMDVI_EVENT_INVALID_DTE_REQ 0x8 #define AMDVI_EVENT_INVALID_PPR_REQ 0x9 #define AMDVI_EVENT_COUNTER_ZERO 0xA #define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ #define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) /* * IOMMU control block. */ struct amdvi_ctrl { struct { uint16_t size:9; uint16_t :3; uint64_t base:40; /* Devtable register base. */ uint16_t :12; } dte; struct { uint16_t :12; uint64_t base:40; uint8_t :4; uint8_t len:4; uint8_t :4; } cmd; struct { uint16_t :12; uint64_t base:40; uint8_t :4; uint8_t len:4; uint8_t :4; } event; uint16_t control :13; uint64_t :51; struct { uint8_t enable:1; uint8_t allow:1; uint16_t :10; uint64_t base:40; uint16_t :12; uint16_t :12; uint64_t limit:40; uint16_t :12; } excl; /* * Revision 2 only. */ uint64_t ex_feature; struct { uint16_t :12; uint64_t base:40; uint8_t :4; uint8_t len:4; uint8_t :4; } ppr; uint64_t first_event; uint64_t second_event; uint64_t event_status; /* Revision 2 only, end. */ uint8_t pad1[0x1FA8]; /* Padding. */ uint32_t cmd_head:19; uint64_t :45; uint32_t cmd_tail:19; uint64_t :45; uint32_t evt_head:19; uint64_t :45; uint32_t evt_tail:19; uint64_t :45; - uint64_t :56; - uint8_t status:8; + uint32_t status:19; + uint64_t :45; uint64_t pad2; uint8_t :4; uint16_t ppr_head:15; uint64_t :45; uint8_t :4; uint16_t ppr_tail:15; uint64_t :45; uint8_t pad3[0x1FC0]; /* Padding. */ /* XXX: More for rev2. */ } __attribute__((__packed__)); CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); #define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ /* * AMF IOMMU v2 size including event counters */ #define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); /* IVHD flag */ #define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ #define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ #define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ #define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ #define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ #define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ #define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ #define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ /* IVHD device entry data setting. */ #define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ #define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ /* Bit[5:4] for System Mgmt. Bit3 is reserved. */ #define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ #define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ #define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ /* IVHD 8-byte extended data settings. */ #define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ /* IOMMU control register. */ #define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ #define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ #define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ #define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ #define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ #define AMDVI_CTRL_PPW BIT(8) #define AMDVI_CTRL_RPPW BIT(9) #define AMDVI_CTRL_COH BIT(10) #define AMDVI_CTRL_ISOC BIT(11) #define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ #define AMDVI_CTRL_PPRLOG BIT(13) #define AMDVI_CTRL_PPRINT BIT(14) #define AMDVI_CTRL_PPREN BIT(15) #define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ #define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ /* Invalidation timeout. */ #define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ #define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ #define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ #define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ #define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ #define AMDVI_CTRL_INV_TO_10S 5 /* 10 second */ #define AMDVI_CTRL_INV_TO_100S 6 /* 100 second */ /* * Max number of PCI devices. * 256 bus x 32 slot/devices x 8 functions. */ #define PCI_NUM_DEV_MAX 0x10000 /* Maximum number of domains supported by IOMMU. */ #define AMDVI_MAX_DOMAIN (BIT(16) - 1) /* * IOMMU Page Table attributes. */ #define AMDVI_PT_PRESENT BIT(0) #define AMDVI_PT_COHERENT BIT(60) #define AMDVI_PT_READ BIT(61) #define AMDVI_PT_WRITE BIT(62) #define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) #define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ #define AMDVI_PD_LEVEL_SHIFT 9 #define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) /* * IOMMU Status, offset 0x2020 */ #define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ #define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ /* Completion wait command completed. */ #define AMDVI_STATUS_CMP BIT(2) #define IVRS_CTRL_RID 1 /* MMIO RID */ /* ACPI IVHD */ struct ivhd_dev_cfg { uint32_t start_id; uint32_t end_id; uint8_t data; /* Device configuration. */ bool enable_ats; /* ATS enabled for the device. */ int ats_qlen; /* ATS invalidation queue depth. */ }; struct amdvi_domain { uint64_t *ptp; /* Highest level page table */ int ptp_level; /* Level of page tables */ u_int id; /* Domain id */ SLIST_ENTRY (amdvi_domain) next; }; /* * AMD IOMMU softc. */ struct amdvi_softc { struct amdvi_ctrl *ctrl; /* Control area. */ device_t dev; /* IOMMU device. */ bool iotlb; /* IOTLB supported by IOMMU */ struct amdvi_cmd *cmd; /* Command descriptor area. */ int cmd_max; /* Max number of commands. */ uint64_t cmp_data; /* Command completion write back. */ struct amdvi_event *event; /* Event descriptor area. */ struct resource *event_res; /* Event interrupt resource. */ void *event_tag; /* Event interrupt tag. */ int event_max; /* Max number of events. */ int event_irq; int event_rid; /* ACPI various flags. */ uint32_t ivhd_flag; /* ACPI IVHD flag. */ uint32_t ivhd_efr; /* ACPI v1 Reserved or v2 EFR . */ /* PCI related. */ uint16_t cap_off; /* PCI Capability offset. */ uint8_t pci_cap; /* PCI capability. */ uint64_t pci_efr; /* PCI EFR for rev2.0 */ uint16_t pci_seg; /* IOMMU PCI domain/segment. */ uint16_t pci_rid; /* PCI BDF of IOMMU */ /* Device range under this IOMMU. */ uint16_t start_dev_rid; /* First device under this IOMMU. */ uint16_t end_dev_rid; /* Last device under this IOMMU. */ /* BIOS provided device configuration for end points. */ struct ivhd_dev_cfg dev_cfg[10]; int dev_cfg_cnt; /* Software statistics. */ uint64_t event_intr_cnt; /* Total event INTR count. */ uint64_t total_cmd; /* Total number of commands. */ }; int amdvi_setup_hw(struct amdvi_softc *softc); int amdvi_teardown_hw(struct amdvi_softc *softc); #endif /* _AMDVI_PRIV_H_ */ Index: projects/bsd_rdma_4_9/sys/amd64/vmm/amd/ivrs_drv.c =================================================================== --- projects/bsd_rdma_4_9/sys/amd64/vmm/amd/ivrs_drv.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/amd64/vmm/amd/ivrs_drv.c (revision 326162) @@ -1,500 +1,512 @@ /*- * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include "io/iommu.h" #include "amdvi_priv.h" device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ int ivhd_count; /* Number of IVHD or AMD-Vi devices. */ extern int amdvi_ptp_level; /* Page table levels. */ typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER * ptr, void *arg); /* * Iterate IVRS table for IVHD and IVMD device type. */ static void ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) { ACPI_TABLE_IVRS *ivrs; ACPI_IVRS_HEADER *ivrs_hdr, *end; ACPI_STATUS status; status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); if (ACPI_FAILURE(status)) return; if (ivrs->Header.Length == 0) { return; } ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); while (ivrs_hdr < end) { if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", ivrs_hdr->Length); break; } switch (ivrs_hdr->Type) { case ACPI_IVRS_TYPE_HARDWARE: /* Legacy */ case 0x11: case 0x40: /* ACPI HID */ if (!iter(ivrs_hdr, arg)) return; break; - + case ACPI_IVRS_TYPE_MEMORY1: case ACPI_IVRS_TYPE_MEMORY2: case ACPI_IVRS_TYPE_MEMORY3: if (!iter(ivrs_hdr, arg)) return; break; - + default: printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); } ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + ivrs_hdr->Length); } } static int ivrs_is_ivhd(UINT8 type) { if ((type == ACPI_IVRS_TYPE_HARDWARE) || (type == 0x11) || (type == 0x40)) return (1); return (0); } /* Count the number of AMD-Vi devices in the system. */ static int ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) { if (ivrs_is_ivhd(ivrs_he->Type)) ivhd_count++; return (1); } struct find_ivrs_hdr_args { int i; ACPI_IVRS_HEADER *ptr; }; static int ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) { struct find_ivrs_hdr_args *fi; fi = (struct find_ivrs_hdr_args *)args; if (ivrs_is_ivhd(ivrs_hdr->Type)) { if (fi->i == 0) { fi->ptr = ivrs_hdr; return (0); } fi->i--; } return (1); } static ACPI_IVRS_HARDWARE * ivhd_find_by_index(int idx) { struct find_ivrs_hdr_args fi; fi.i = idx; fi.ptr = NULL; ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); return ((ACPI_IVRS_HARDWARE *)fi.ptr); } static void ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, uint32_t end_id, uint8_t cfg, bool ats) { struct ivhd_dev_cfg *dev_cfg; /* If device doesn't have special data, don't add it. */ if (!cfg) return; dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; dev_cfg->start_id = start_id; dev_cfg->end_id = end_id; dev_cfg->data = cfg; dev_cfg->enable_ats = ats; } /* * Record device attributes as suggested by BIOS. */ static int ivhd_dev_parse(ACPI_IVRS_HARDWARE * ivhd, struct amdvi_softc *softc) { - ACPI_IVRS_DE_HEADER *de, *end; + ACPI_IVRS_DE_HEADER *de; + uint8_t *p, *end; int range_start_id = 0, range_end_id = 0; uint32_t *extended; uint8_t all_data = 0, range_data = 0; bool range_enable_ats = false, enable_ats; softc->start_dev_rid = ~0; softc->end_dev_rid = 0; - de = (ACPI_IVRS_DE_HEADER *) ((uint8_t *)ivhd + - sizeof(ACPI_IVRS_HARDWARE)); - end = (ACPI_IVRS_DE_HEADER *) ((uint8_t *)ivhd + - ivhd->Header.Length); + /* + * XXX The following actually depends on Header.Type and + * is only true for 0x10. + */ + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE); + end = (uint8_t *)ivhd + ivhd->Header.Length; - while (de < (ACPI_IVRS_DE_HEADER *) end) { + while (p < end) { + de = (ACPI_IVRS_DE_HEADER *)p; softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id); softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id); switch (de->Type) { case ACPI_IVRS_TYPE_ALL: all_data = de->DataSetting; break; case ACPI_IVRS_TYPE_SELECT: case ACPI_IVRS_TYPE_ALIAS_SELECT: case ACPI_IVRS_TYPE_EXT_SELECT: enable_ats = false; if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { extended = (uint32_t *)(de + 1); enable_ats = (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? false : true; } ivhd_dev_add_entry(softc, de->Id, de->Id, de->DataSetting | all_data, enable_ats); break; case ACPI_IVRS_TYPE_START: case ACPI_IVRS_TYPE_ALIAS_START: case ACPI_IVRS_TYPE_EXT_START: range_start_id = de->Id; range_data = de->DataSetting; if (de->Type == ACPI_IVRS_TYPE_EXT_START) { extended = (uint32_t *)(de + 1); range_enable_ats = (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? false : true; } break; case ACPI_IVRS_TYPE_END: range_end_id = de->Id; ivhd_dev_add_entry(softc, range_start_id, range_end_id, range_data | all_data, range_enable_ats); range_start_id = range_end_id = 0; range_data = 0; all_data = 0; break; case ACPI_IVRS_TYPE_PAD4: break; case ACPI_IVRS_TYPE_SPECIAL: /* HPET or IOAPIC */ break; default: if ((de->Type < 5) || (de->Type >= ACPI_IVRS_TYPE_PAD8)) device_printf(softc->dev, "Unknown dev entry:0x%x\n", de->Type); } if (softc->dev_cfg_cnt > (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) { device_printf(softc->dev, "WARN Too many device entries.\n"); return (EINVAL); } - de++; + if (de->Type < 0x40) + p += sizeof(ACPI_IVRS_DEVICE4); + else if (de->Type < 0x80) + p += sizeof(ACPI_IVRS_DEVICE8A); + else { + printf("Variable size IVHD type 0x%x not supported\n", + de->Type); + break; + } } KASSERT((softc->end_dev_rid >= softc->start_dev_rid), ("Device end[0x%x] < start[0x%x.\n", softc->end_dev_rid, softc->start_dev_rid)); return (0); } static void ivhd_identify(driver_t *driver, device_t parent) { ACPI_TABLE_IVRS *ivrs; ACPI_IVRS_HARDWARE *ivhd; ACPI_STATUS status; uint32_t info; int i, count = 0; if (acpi_disabled("ivhd")) return; status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); if (ACPI_FAILURE(status)) return; if (ivrs->Header.Length == 0) { return; } info = ivrs->Info; printf("AMD-Vi IVRS VAsize = %d PAsize = %d GVAsize = %d flags:%b\n", REG_BITS(info, 21, 15), REG_BITS(info, 14, 8), REG_BITS(info, 7, 5), REG_BITS(info, 22, 22), "\020\001HtAtsResv"); ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL); if (!ivhd_count) return; ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF, M_WAITOK | M_ZERO); for (i = 0; i < ivhd_count; i++) { ivhd = ivhd_find_by_index(i); if (ivhd == NULL) { printf("Can't find IVHD entry%d\n", i); continue; } ivhd_devs[i] = BUS_ADD_CHILD(parent, 1, "ivhd", i); /* * XXX: In case device was not destroyed before, add will fail. * locate the old device instance. */ if (ivhd_devs[i] == NULL) { ivhd_devs[i] = device_find_child(parent, "ivhd", i); if (ivhd_devs[i] == NULL) { printf("AMD-Vi: cant find AMD-Vi dev%d\n", i); break; } } count++; } /* * Update device count in case failed to attach. */ ivhd_count = count; } static int ivhd_probe(device_t dev) { if (acpi_get_handle(dev) != NULL) return (ENXIO); device_set_desc(dev, "AMD-Vi/IOMMU or ivhd"); return (BUS_PROBE_NOWILDCARD); } static int ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd) { device_t dev; int max_ptp_level; dev = softc->dev; device_printf(dev, "Flag:%b\n", softc->ivhd_flag, "\020\001HtTunEn\002PassPW\003ResPassPW\004Isoc\005IotlbSup" "\006Coherent\007PreFSup\008PPRSup"); /* * If no extended feature[EFR], its rev1 with maximum paging level as 7. */ max_ptp_level = 7; if (softc->ivhd_efr) { device_printf(dev, "EFR HATS = %d GATS = %d GLXSup = %d " "MsiNumPr = %d PNBanks= %d PNCounters= %d\n" "max PASID = %d EFR: %b \n", REG_BITS(softc->ivhd_efr, 31, 30), REG_BITS(softc->ivhd_efr, 29, 28), REG_BITS(softc->ivhd_efr, 4, 3), REG_BITS(softc->ivhd_efr, 27, 23), REG_BITS(softc->ivhd_efr, 22, 17), REG_BITS(softc->ivhd_efr, 16, 13), REG_BITS(softc->ivhd_efr, 12, 8), softc->ivhd_efr, "\020\001XTSup\002NXSup\003GTSup\005IASup" "\006GASup\007HESup\008PPRSup"); max_ptp_level = REG_BITS(softc->ivhd_efr, 31, 30) + 4; } /* Make sure device support minimum page level as requested by user. */ if (max_ptp_level < amdvi_ptp_level) { device_printf(dev, "Insufficient PTP level:%d\n", max_ptp_level); return (EINVAL); } device_printf(softc->dev, "max supported paging level:%d restricting to: %d\n", max_ptp_level, amdvi_ptp_level); device_printf(softc->dev, "device supported range " "[0x%x - 0x%x]\n", softc->start_dev_rid, softc->end_dev_rid); return (0); } static int ivhd_attach(device_t dev) { ACPI_IVRS_HARDWARE *ivhd; struct amdvi_softc *softc; int status, unit; unit = device_get_unit(dev); /* Make sure its same device for which attach is called. */ if (ivhd_devs[unit] != dev) panic("Not same device old %p new %p", ivhd_devs[unit], dev); softc = device_get_softc(dev); softc->dev = dev; ivhd = ivhd_find_by_index(unit); if (ivhd == NULL) return (EINVAL); softc->pci_seg = ivhd->PciSegmentGroup; softc->pci_rid = ivhd->Header.DeviceId; softc->ivhd_flag = ivhd->Header.Flags; softc->ivhd_efr = ivhd->Reserved; /* * PCI capability has more capabilities that are not part of IVRS. */ softc->cap_off = ivhd->CapabilityOffset; #ifdef notyet /* IVHD Info bit[4:0] is event MSI/X number. */ softc->event_msix = ivhd->Info & 0x1F; #endif softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); status = ivhd_dev_parse(ivhd, softc); if (status != 0) { device_printf(dev, "endpoint device parsing error=%d\n", status); } status = ivhd_print_cap(softc, ivhd); if (status != 0) { return (status); } status = amdvi_setup_hw(softc); if (status != 0) { device_printf(dev, "couldn't be initialised, error=%d\n", status); return (status); } return (0); } static int ivhd_detach(device_t dev) { struct amdvi_softc *softc; softc = device_get_softc(dev); amdvi_teardown_hw(softc); /* * XXX: delete the device. * don't allow detach, return EBUSY. */ return (0); } static int ivhd_suspend(device_t dev) { return (0); } static int ivhd_resume(device_t dev) { return (0); } static device_method_t ivhd_methods[] = { DEVMETHOD(device_identify, ivhd_identify), DEVMETHOD(device_probe, ivhd_probe), DEVMETHOD(device_attach, ivhd_attach), DEVMETHOD(device_detach, ivhd_detach), DEVMETHOD(device_suspend, ivhd_suspend), DEVMETHOD(device_resume, ivhd_resume), DEVMETHOD_END }; static driver_t ivhd_driver = { "ivhd", ivhd_methods, sizeof(struct amdvi_softc), }; static devclass_t ivhd_devclass; /* * Load this module at the end after PCI re-probing to configure interrupt. */ DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0, SI_ORDER_ANY); MODULE_DEPEND(ivhd, acpi, 1, 1, 1); MODULE_DEPEND(ivhd, pci, 1, 1, 1); Index: projects/bsd_rdma_4_9/sys/arm/allwinner/clkng/aw_ccung.c =================================================================== --- projects/bsd_rdma_4_9/sys/arm/allwinner/clkng/aw_ccung.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/arm/allwinner/clkng/aw_ccung.c (revision 326162) @@ -1,412 +1,428 @@ /*- * Copyright (c) 2017 Emmanuel Vadot * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Allwinner Clock Control Unit */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __aarch64__ #include "opt_soc.h" #endif #if defined(SOC_ALLWINNER_A13) #include #endif #if defined(SOC_ALLWINNER_A31) #include #endif #if defined(SOC_ALLWINNER_A64) #include #include #endif #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5) #include #include #endif +#if defined(SOC_ALLWINNER_A83T) +#include +#endif + #include "clkdev_if.h" #include "hwreset_if.h" static struct resource_spec aw_ccung_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { -1, 0 } }; #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5) #define H3_CCU 1 #define H3_R_CCU 2 #endif #if defined(SOC_ALLWINNER_A31) #define A31_CCU 3 #endif #if defined(SOC_ALLWINNER_A64) #define A64_CCU 4 #define A64_R_CCU 5 #endif #if defined(SOC_ALLWINNER_A13) #define A13_CCU 6 #endif +#if defined(SOC_ALLWINNER_A83T) +#define A83T_CCU 7 +#endif + static struct ofw_compat_data compat_data[] = { #if defined(SOC_ALLWINNER_A31) { "allwinner,sun5i-a13-ccu", A13_CCU}, #endif #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5) { "allwinner,sun8i-h3-ccu", H3_CCU }, { "allwinner,sun8i-h3-r-ccu", H3_R_CCU }, #endif #if defined(SOC_ALLWINNER_A31) { "allwinner,sun6i-a31-ccu", A31_CCU }, #endif #if defined(SOC_ALLWINNER_A64) { "allwinner,sun50i-a64-ccu", A64_CCU }, { "allwinner,sun50i-a64-r-ccu", A64_R_CCU }, #endif +#if defined(SOC_ALLWINNER_A83T) + { "allwinner,sun8i-a83t-ccu", A83T_CCU }, +#endif {NULL, 0 } }; #define CCU_READ4(sc, reg) bus_read_4((sc)->res, (reg)) #define CCU_WRITE4(sc, reg, val) bus_write_4((sc)->res, (reg), (val)) static int aw_ccung_write_4(device_t dev, bus_addr_t addr, uint32_t val) { struct aw_ccung_softc *sc; sc = device_get_softc(dev); CCU_WRITE4(sc, addr, val); return (0); } static int aw_ccung_read_4(device_t dev, bus_addr_t addr, uint32_t *val) { struct aw_ccung_softc *sc; sc = device_get_softc(dev); *val = CCU_READ4(sc, addr); return (0); } static int aw_ccung_modify_4(device_t dev, bus_addr_t addr, uint32_t clr, uint32_t set) { struct aw_ccung_softc *sc; uint32_t reg; sc = device_get_softc(dev); reg = CCU_READ4(sc, addr); reg &= ~clr; reg |= set; CCU_WRITE4(sc, addr, reg); return (0); } static int aw_ccung_reset_assert(device_t dev, intptr_t id, bool reset) { struct aw_ccung_softc *sc; uint32_t val; sc = device_get_softc(dev); if (id >= sc->nresets || sc->resets[id].offset == 0) return (0); mtx_lock(&sc->mtx); val = CCU_READ4(sc, sc->resets[id].offset); if (reset) val &= ~(1 << sc->resets[id].shift); else val |= 1 << sc->resets[id].shift; CCU_WRITE4(sc, sc->resets[id].offset, val); mtx_unlock(&sc->mtx); return (0); } static int aw_ccung_reset_is_asserted(device_t dev, intptr_t id, bool *reset) { struct aw_ccung_softc *sc; uint32_t val; sc = device_get_softc(dev); if (id >= sc->nresets || sc->resets[id].offset == 0) return (0); mtx_lock(&sc->mtx); val = CCU_READ4(sc, sc->resets[id].offset); *reset = (val & (1 << sc->resets[id].shift)) != 0 ? false : true; mtx_unlock(&sc->mtx); return (0); } static void aw_ccung_device_lock(device_t dev) { struct aw_ccung_softc *sc; sc = device_get_softc(dev); mtx_lock(&sc->mtx); } static void aw_ccung_device_unlock(device_t dev) { struct aw_ccung_softc *sc; sc = device_get_softc(dev); mtx_unlock(&sc->mtx); } static int aw_ccung_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) return (ENXIO); device_set_desc(dev, "Allwinner Clock Control Unit NG"); return (BUS_PROBE_DEFAULT); } static int aw_ccung_register_gates(struct aw_ccung_softc *sc) { struct clk_gate_def def; int i; for (i = 0; i < sc->ngates; i++) { if (sc->gates[i].name == NULL) continue; memset(&def, 0, sizeof(def)); def.clkdef.id = i; def.clkdef.name = sc->gates[i].name; def.clkdef.parent_names = &sc->gates[i].parent_name; def.clkdef.parent_cnt = 1; def.offset = sc->gates[i].offset; def.shift = sc->gates[i].shift; def.mask = 1; def.on_value = 1; def.off_value = 0; clknode_gate_register(sc->clkdom, &def); } return (0); } static void aw_ccung_init_clocks(struct aw_ccung_softc *sc) { struct clknode *clknode; int i, error; for (i = 0; i < sc->n_clk_init; i++) { clknode = clknode_find_by_name(sc->clk_init[i].name); if (clknode == NULL) { device_printf(sc->dev, "Cannot find clock %s\n", sc->clk_init[i].name); continue; } if (sc->clk_init[i].parent_name != NULL) { if (bootverbose) device_printf(sc->dev, "Setting %s as parent for %s\n", sc->clk_init[i].parent_name, sc->clk_init[i].name); error = clknode_set_parent_by_name(clknode, sc->clk_init[i].parent_name); if (error != 0) { device_printf(sc->dev, "Cannot set parent to %s for %s\n", sc->clk_init[i].parent_name, sc->clk_init[i].name); continue; } } if (sc->clk_init[i].default_freq != 0) { error = clknode_set_freq(clknode, sc->clk_init[i].default_freq, 0 , 0); if (error != 0) { device_printf(sc->dev, "Cannot set frequency for %s to %ju\n", sc->clk_init[i].name, sc->clk_init[i].default_freq); continue; } } if (sc->clk_init[i].enable) { error = clknode_enable(clknode); if (error != 0) { device_printf(sc->dev, "Cannot enable %s\n", sc->clk_init[i].name); continue; } } } } static int aw_ccung_attach(device_t dev) { struct aw_ccung_softc *sc; sc = device_get_softc(dev); sc->dev = dev; if (bus_alloc_resources(dev, aw_ccung_spec, &sc->res) != 0) { device_printf(dev, "cannot allocate resources for device\n"); return (ENXIO); } mtx_init(&sc->mtx, device_get_nameunit(dev), NULL, MTX_DEF); sc->type = ofw_bus_search_compatible(dev, compat_data)->ocd_data; sc->clkdom = clkdom_create(dev); if (sc->clkdom == NULL) panic("Cannot create clkdom\n"); switch (sc->type) { #if defined(SOC_ALLWINNER_A13) case A13_CCU: ccu_a13_register_clocks(sc); break; #endif #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5) case H3_CCU: ccu_h3_register_clocks(sc); break; case H3_R_CCU: ccu_sun8i_r_register_clocks(sc); break; #endif #if defined(SOC_ALLWINNER_A31) case A31_CCU: ccu_a31_register_clocks(sc); break; #endif #if defined(SOC_ALLWINNER_A64) case A64_CCU: ccu_a64_register_clocks(sc); break; case A64_R_CCU: ccu_sun8i_r_register_clocks(sc); + break; +#endif +#if defined(SOC_ALLWINNER_A83T) + case A83T_CCU: + ccu_a83t_register_clocks(sc); break; #endif } if (sc->gates) aw_ccung_register_gates(sc); if (clkdom_finit(sc->clkdom) != 0) panic("cannot finalize clkdom initialization\n"); clkdom_xlock(sc->clkdom); aw_ccung_init_clocks(sc); clkdom_unlock(sc->clkdom); if (bootverbose) clkdom_dump(sc->clkdom); /* If we have resets, register our self as a reset provider */ if (sc->resets) hwreset_register_ofw_provider(dev); return (0); } static device_method_t aw_ccung_methods[] = { /* Device interface */ DEVMETHOD(device_probe, aw_ccung_probe), DEVMETHOD(device_attach, aw_ccung_attach), /* clkdev interface */ DEVMETHOD(clkdev_write_4, aw_ccung_write_4), DEVMETHOD(clkdev_read_4, aw_ccung_read_4), DEVMETHOD(clkdev_modify_4, aw_ccung_modify_4), DEVMETHOD(clkdev_device_lock, aw_ccung_device_lock), DEVMETHOD(clkdev_device_unlock, aw_ccung_device_unlock), /* Reset interface */ DEVMETHOD(hwreset_assert, aw_ccung_reset_assert), DEVMETHOD(hwreset_is_asserted, aw_ccung_reset_is_asserted), DEVMETHOD_END }; static driver_t aw_ccung_driver = { "aw_ccung", aw_ccung_methods, sizeof(struct aw_ccung_softc), }; static devclass_t aw_ccung_devclass; EARLY_DRIVER_MODULE(aw_ccung, simplebus, aw_ccung_driver, aw_ccung_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE); MODULE_VERSION(aw_ccung, 1); Index: projects/bsd_rdma_4_9/sys/arm/arm/machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/arm/arm/machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/arm/arm/machdep.c (revision 326162) @@ -1,1243 +1,1247 @@ /* $NetBSD: arm32_machdep.c,v 1.44 2004/03/24 15:34:47 atatat Exp $ */ /*- * Copyright (c) 2004 Olivier Houchard * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe * for the NetBSD Project. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Machine dependent functions for kernel setup * * Created : 17/09/94 * Updated : 18/04/01 updated for new wscons */ #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_platform.h" #include "opt_sched.h" #include "opt_timer.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FDT #include #include #endif #ifdef DEBUG #define debugf(fmt, args...) printf(fmt, ##args) #else #define debugf(fmt, args...) #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) || \ defined(COMPAT_FREEBSD9) #error FreeBSD/arm doesn't provide compatibility with releases prior to 10 #endif #if __ARM_ARCH >= 6 && !defined(INTRNG) #error armv6 requires INTRNG #endif struct pcpu __pcpu[MAXCPU]; struct pcpu *pcpup = &__pcpu[0]; static struct trapframe proc0_tf; uint32_t cpu_reset_address = 0; int cold = 1; vm_offset_t vector_page; int (*_arm_memcpy)(void *, void *, int, int) = NULL; int (*_arm_bzero)(void *, int, int) = NULL; int _min_memcpy_size = 0; int _min_bzero_size = 0; extern int *end; #ifdef FDT vm_paddr_t pmap_pa; #if __ARM_ARCH >= 6 vm_offset_t systempage; vm_offset_t irqstack; vm_offset_t undstack; vm_offset_t abtstack; #else /* * This is the number of L2 page tables required for covering max * (hypothetical) memsize of 4GB and all kernel mappings (vectors, msgbuf, * stacks etc.), uprounded to be divisible by 4. */ #define KERNEL_PT_MAX 78 static struct pv_addr kernel_pt_table[KERNEL_PT_MAX]; struct pv_addr systempage; static struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; static struct pv_addr kernelstack; #endif /* __ARM_ARCH >= 6 */ #endif /* FDT */ #ifdef PLATFORM static delay_func *delay_impl; static void *delay_arg; #endif struct kva_md_info kmi; /* * arm32_vector_init: * * Initialize the vector page, and select whether or not to * relocate the vectors. * * NOTE: We expect the vector page to be mapped at its expected * destination. */ extern unsigned int page0[], page0_data[]; void arm_vector_init(vm_offset_t va, int which) { unsigned int *vectors = (int *) va; unsigned int *vectors_data = vectors + (page0_data - page0); int vec; /* * Loop through the vectors we're taking over, and copy the * vector's insn and data word. */ for (vec = 0; vec < ARM_NVEC; vec++) { if ((which & (1 << vec)) == 0) { /* Don't want to take over this vector. */ continue; } vectors[vec] = page0[vec]; vectors_data[vec] = page0_data[vec]; } /* Now sync the vectors. */ icache_sync(va, (ARM_NVEC * 2) * sizeof(u_int)); vector_page = va; #if __ARM_ARCH < 6 if (va == ARM_VECTORS_HIGH) { /* * Enable high vectors in the system control reg (SCTLR). * * Assume the MD caller knows what it's doing here, and really * does want the vector page relocated. * * Note: This has to be done here (and not just in * cpu_setup()) because the vector page needs to be * accessible *before* cpu_startup() is called. * Think ddb(9) ... */ cpu_control(CPU_CONTROL_VECRELOC, CPU_CONTROL_VECRELOC); } #endif } static void cpu_startup(void *dummy) { struct pcb *pcb = thread0.td_pcb; const unsigned int mbyte = 1024 * 1024; #if __ARM_ARCH < 6 && !defined(ARM_CACHE_LOCK_ENABLE) vm_page_t m; #endif identify_arm_cpu(); vm_ksubmap_init(&kmi); /* * Display the RAM layout. */ printf("real memory = %ju (%ju MB)\n", (uintmax_t)arm32_ptob(realmem), (uintmax_t)arm32_ptob(realmem) / mbyte); printf("avail memory = %ju (%ju MB)\n", (uintmax_t)arm32_ptob(vm_cnt.v_free_count), (uintmax_t)arm32_ptob(vm_cnt.v_free_count) / mbyte); if (bootverbose) { arm_physmem_print_tables(); devmap_print_table(); } bufinit(); vm_pager_bufferinit(); pcb->pcb_regs.sf_sp = (u_int)thread0.td_kstack + USPACE_SVC_STACK_TOP; pmap_set_pcb_pagedir(kernel_pmap, pcb); #if __ARM_ARCH < 6 vector_page_setprot(VM_PROT_READ); pmap_postinit(); #ifdef ARM_CACHE_LOCK_ENABLE pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS); arm_lock_cache_line(ARM_TP_ADDRESS); #else m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); pmap_kenter_user(ARM_TP_ADDRESS, VM_PAGE_TO_PHYS(m)); #endif *(uint32_t *)ARM_RAS_START = 0; *(uint32_t *)ARM_RAS_END = 0xffffffff; #endif } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { dcache_wb_poc((vm_offset_t)ptr, (vm_paddr_t)vtophys(ptr), len); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } void cpu_idle(int busy) { CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); spinlock_enter(); #ifndef NO_EVENTTIMERS if (!busy) cpu_idleclock(); #endif if (!sched_runnable()) cpu_sleep(0); #ifndef NO_EVENTTIMERS if (!busy) cpu_activeclock(); #endif spinlock_exit(); CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { return (0); } #ifdef NO_EVENTTIMERS /* * Most ARM platforms don't need to do anything special to init their clocks * (they get intialized during normal device attachment), and by not defining a * cpu_initclocks() function they get this generic one. Any platform that needs * to do something special can just provide their own implementation, which will * override this one due to the weak linkage. */ void arm_generic_initclocks(void) { } __weak_reference(arm_generic_initclocks, cpu_initclocks); #else void cpu_initclocks(void) { #ifdef SMP if (PCPU_GET(cpuid) == 0) cpu_initclocks_bsp(); else cpu_initclocks_ap(); #else cpu_initclocks_bsp(); #endif } #endif #ifdef PLATFORM void arm_set_delay(delay_func *impl, void *arg) { KASSERT(impl != NULL, ("No DELAY implementation")); delay_impl = impl; delay_arg = arg; } void DELAY(int usec) { delay_impl(usec, delay_arg); } #endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; register_t cspr; td = curthread; if (td->td_md.md_spinlock_count == 0) { cspr = disable_interrupts(PSR_I | PSR_F); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_cspr = cspr; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t cspr; td = curthread; critical_exit(); cspr = td->td_md.md_saved_cspr; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) restore_interrupts(cspr); } /* * Clear registers on exec */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf = td->td_frame; memset(tf, 0, sizeof(*tf)); tf->tf_usr_sp = stack; tf->tf_usr_lr = imgp->entry_addr; tf->tf_svc_lr = 0x77777777; tf->tf_pc = imgp->entry_addr; tf->tf_spsr = PSR_USR32_MODE; } #ifdef VFP /* * Get machine VFP context. */ void get_vfpcontext(struct thread *td, mcontext_vfp_t *vfp) { struct pcb *pcb; pcb = td->td_pcb; if (td == curthread) { critical_enter(); vfp_store(&pcb->pcb_vfpstate, false); critical_exit(); } else MPASS(TD_IS_SUSPENDED(td)); memcpy(vfp->mcv_reg, pcb->pcb_vfpstate.reg, sizeof(vfp->mcv_reg)); vfp->mcv_fpscr = pcb->pcb_vfpstate.fpscr; } /* * Set machine VFP context. */ void set_vfpcontext(struct thread *td, mcontext_vfp_t *vfp) { struct pcb *pcb; pcb = td->td_pcb; if (td == curthread) { critical_enter(); vfp_discard(td); critical_exit(); } else MPASS(TD_IS_SUSPENDED(td)); memcpy(pcb->pcb_vfpstate.reg, vfp->mcv_reg, sizeof(pcb->pcb_vfpstate.reg)); pcb->pcb_vfpstate.fpscr = vfp->mcv_fpscr; } #endif int arm_get_vfpstate(struct thread *td, void *args) { int rv; struct arm_get_vfpstate_args ua; mcontext_vfp_t mcontext_vfp; rv = copyin(args, &ua, sizeof(ua)); if (rv != 0) return (rv); if (ua.mc_vfp_size != sizeof(mcontext_vfp_t)) return (EINVAL); #ifdef VFP get_vfpcontext(td, &mcontext_vfp); #else bzero(&mcontext_vfp, sizeof(mcontext_vfp)); #endif rv = copyout(&mcontext_vfp, ua.mc_vfp, sizeof(mcontext_vfp)); if (rv != 0) return (rv); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret) { struct trapframe *tf = td->td_frame; __greg_t *gr = mcp->__gregs; if (clear_ret & GET_MC_CLEAR_RET) { gr[_REG_R0] = 0; gr[_REG_CPSR] = tf->tf_spsr & ~PSR_C; } else { gr[_REG_R0] = tf->tf_r0; gr[_REG_CPSR] = tf->tf_spsr; } gr[_REG_R1] = tf->tf_r1; gr[_REG_R2] = tf->tf_r2; gr[_REG_R3] = tf->tf_r3; gr[_REG_R4] = tf->tf_r4; gr[_REG_R5] = tf->tf_r5; gr[_REG_R6] = tf->tf_r6; gr[_REG_R7] = tf->tf_r7; gr[_REG_R8] = tf->tf_r8; gr[_REG_R9] = tf->tf_r9; gr[_REG_R10] = tf->tf_r10; gr[_REG_R11] = tf->tf_r11; gr[_REG_R12] = tf->tf_r12; gr[_REG_SP] = tf->tf_usr_sp; gr[_REG_LR] = tf->tf_usr_lr; gr[_REG_PC] = tf->tf_pc; mcp->mc_vfp_size = 0; mcp->mc_vfp_ptr = NULL; memset(&mcp->mc_spare, 0, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { mcontext_vfp_t mc_vfp, *vfp; struct trapframe *tf = td->td_frame; const __greg_t *gr = mcp->__gregs; + int spsr; + /* + * Make sure the processor mode has not been tampered with and + * interrupts have not been disabled. + */ + spsr = gr[_REG_CPSR]; + if ((spsr & PSR_MODE) != PSR_USR32_MODE || + (spsr & (PSR_I | PSR_F)) != 0) + return (EINVAL); + #ifdef WITNESS if (mcp->mc_vfp_size != 0 && mcp->mc_vfp_size != sizeof(mc_vfp)) { printf("%s: %s: Malformed mc_vfp_size: %d (0x%08X)\n", td->td_proc->p_comm, __func__, mcp->mc_vfp_size, mcp->mc_vfp_size); } else if (mcp->mc_vfp_size != 0 && mcp->mc_vfp_ptr == NULL) { printf("%s: %s: c_vfp_size != 0 but mc_vfp_ptr == NULL\n", td->td_proc->p_comm, __func__); } #endif if (mcp->mc_vfp_size == sizeof(mc_vfp) && mcp->mc_vfp_ptr != NULL) { if (copyin(mcp->mc_vfp_ptr, &mc_vfp, sizeof(mc_vfp)) != 0) return (EFAULT); vfp = &mc_vfp; } else { vfp = NULL; } tf->tf_r0 = gr[_REG_R0]; tf->tf_r1 = gr[_REG_R1]; tf->tf_r2 = gr[_REG_R2]; tf->tf_r3 = gr[_REG_R3]; tf->tf_r4 = gr[_REG_R4]; tf->tf_r5 = gr[_REG_R5]; tf->tf_r6 = gr[_REG_R6]; tf->tf_r7 = gr[_REG_R7]; tf->tf_r8 = gr[_REG_R8]; tf->tf_r9 = gr[_REG_R9]; tf->tf_r10 = gr[_REG_R10]; tf->tf_r11 = gr[_REG_R11]; tf->tf_r12 = gr[_REG_R12]; tf->tf_usr_sp = gr[_REG_SP]; tf->tf_usr_lr = gr[_REG_LR]; tf->tf_pc = gr[_REG_PC]; tf->tf_spsr = gr[_REG_CPSR]; #ifdef VFP if (vfp != NULL) set_vfpcontext(td, vfp); #endif return (0); } void sendsig(catcher, ksi, mask) sig_t catcher; ksiginfo_t *ksi; sigset_t *mask; { struct thread *td; struct proc *p; struct trapframe *tf; struct sigframe *fp, frame; struct sigacts *psp; struct sysentvec *sysent; int onstack; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_usr_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !(onstack) && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct sigframe *)td->td_frame->tf_usr_sp; /* make room on the stack */ fp--; /* make the stack aligned */ fp = (struct sigframe *)STACKALIGN(fp); /* Populate the siginfo frame. */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); #ifdef VFP get_vfpcontext(td, &frame.sf_vfp); frame.sf_uc.uc_mcontext.mc_vfp_size = sizeof(fp->sf_vfp); frame.sf_uc.uc_mcontext.mc_vfp_ptr = &fp->sf_vfp; #else frame.sf_uc.uc_mcontext.mc_vfp_size = 0; frame.sf_uc.uc_mcontext.mc_vfp_ptr = NULL; #endif frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK ) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. We invoke the handler * directly, only returning via the trampoline. Note the * trampoline version numbers are coordinated with machine- * dependent code in libc. */ tf->tf_r0 = sig; tf->tf_r1 = (register_t)&fp->sf_si; tf->tf_r2 = (register_t)&fp->sf_uc; /* the trampoline uses r5 as the uc address */ tf->tf_r5 = (register_t)&fp->sf_uc; tf->tf_pc = (register_t)catcher; tf->tf_usr_sp = (register_t)fp; sysent = p->p_sysent; if (sysent->sv_sigcode_base != 0) tf->tf_usr_lr = (register_t)sysent->sv_sigcode_base; else tf->tf_usr_lr = (register_t)(sysent->sv_psstrings - *(sysent->sv_szsigcode)); /* Set the mode to enter in the signal handler */ #if __ARM_ARCH >= 7 if ((register_t)catcher & 1) tf->tf_spsr |= PSR_T; else tf->tf_spsr &= ~PSR_T; #endif CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_usr_lr, tf->tf_usr_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; - int spsr; + int error; if (uap == NULL) return (EFAULT); if (copyin(uap->sigcntxp, &uc, sizeof(uc))) return (EFAULT); - /* - * Make sure the processor mode has not been tampered with and - * interrupts have not been disabled. - */ - spsr = uc.uc_mcontext.__gregs[_REG_CPSR]; - if ((spsr & PSR_MODE) != PSR_USR32_MODE || - (spsr & (PSR_I | PSR_F)) != 0) - return (EINVAL); /* Restore register context. */ - set_mcontext(td, &uc.uc_mcontext); + error = set_mcontext(td, &uc.uc_mcontext); + if (error != 0) + return (error); /* Restore signal mask. */ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_regs.sf_r4 = tf->tf_r4; pcb->pcb_regs.sf_r5 = tf->tf_r5; pcb->pcb_regs.sf_r6 = tf->tf_r6; pcb->pcb_regs.sf_r7 = tf->tf_r7; pcb->pcb_regs.sf_r8 = tf->tf_r8; pcb->pcb_regs.sf_r9 = tf->tf_r9; pcb->pcb_regs.sf_r10 = tf->tf_r10; pcb->pcb_regs.sf_r11 = tf->tf_r11; pcb->pcb_regs.sf_r12 = tf->tf_r12; pcb->pcb_regs.sf_pc = tf->tf_pc; pcb->pcb_regs.sf_lr = tf->tf_usr_lr; pcb->pcb_regs.sf_sp = tf->tf_usr_sp; } void pcpu0_init(void) { #if __ARM_ARCH >= 6 set_curthread(&thread0); #endif pcpu_init(pcpup, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); } /* * Initialize proc0 */ void init_proc0(vm_offset_t kstack) { proc_linkup0(&proc0, &thread0); thread0.td_kstack = kstack; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + kstack_pages * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_vfpcpu = -1; thread0.td_pcb->pcb_vfpstate.fpscr = VFPSCR_DN; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; } #if __ARM_ARCH >= 6 void set_stackptrs(int cpu) { set_stackptr(PSR_IRQ32_MODE, irqstack + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); set_stackptr(PSR_ABT32_MODE, abtstack + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); set_stackptr(PSR_UND32_MODE, undstack + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); } #else void set_stackptrs(int cpu) { set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); set_stackptr(PSR_UND32_MODE, undstack.pv_va + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); } #endif #ifdef FDT #if __ARM_ARCH < 6 void * initarm(struct arm_boot_params *abp) { struct mem_region mem_regions[FDT_MEM_REGIONS]; struct pv_addr kernel_l1pt; struct pv_addr dpcpu; vm_offset_t dtbp, freemempos, l2_start, lastaddr; uint64_t memsize; uint32_t l2size; char *env; void *kmdp; u_int l1pagetable; int i, j, err_devmap, mem_regions_sz; lastaddr = parse_boot_param(abp); arm_physmem_kernaddr = abp->abp_physaddr; memsize = 0; cpuinfo_init(); set_cpufuncs(); /* * Find the dtb passed in by the boot loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); else dtbp = (vm_offset_t)NULL; #if defined(FDT_DTB_STATIC) /* * In case the device tree blob was not retrieved (from metadata) try * to use the statically embedded one. */ if (dtbp == (vm_offset_t)NULL) dtbp = (vm_offset_t)&fdt_static_dtb; #endif if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); /* Grab physical memory regions information from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, &memsize) != 0) panic("Cannot get physical memory regions"); arm_physmem_hardware_regions(mem_regions, mem_regions_sz); /* Grab reserved memory regions information from device tree. */ if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0) arm_physmem_exclude_regions(mem_regions, mem_regions_sz, EXFLAG_NODUMP | EXFLAG_NOALLOC); /* Platform-specific initialisation */ platform_probe_and_attach(); pcpu0_init(); /* Do basic tuning, hz etc */ init_param1(); /* Calculate number of L2 tables needed for mapping vm_page_array */ l2size = (memsize / PAGE_SIZE) * sizeof(struct vm_page); l2size = (l2size >> L1_S_SHIFT) + 1; /* * Add one table for end of kernel map, one for stacks, msgbuf and * L1 and L2 tables map, one for vectors map and two for * l2 structures from pmap_bootstrap. */ l2size += 5; /* Make it divisible by 4 */ l2size = (l2size + 3) & ~3; freemempos = (lastaddr + PAGE_MASK) & ~PAGE_MASK; /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_va, (np)); \ (var).pv_pa = (var).pv_va + (abp->abp_physaddr - KERNVIRTADDR); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0) freemempos += PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); for (i = 0, j = 0; i < l2size; ++i) { if (!(i % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[i], L2_TABLE_SIZE / PAGE_SIZE); j = i; } else { kernel_pt_table[i].pv_va = kernel_pt_table[j].pv_va + L2_TABLE_SIZE_REAL * (i - j); kernel_pt_table[i].pv_pa = kernel_pt_table[i].pv_va - KERNVIRTADDR + abp->abp_physaddr; } } /* * Allocate a page for the system page mapped to 0x00000000 * or 0xffff0000. This page will just contain the system vectors * and can be shared by all processes. */ valloc_pages(systempage, 1); /* Allocate dynamic per-cpu area. */ valloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu.pv_va, 0); /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE * MAXCPU); valloc_pages(abtstack, ABT_STACK_SIZE * MAXCPU); valloc_pages(undstack, UND_STACK_SIZE * MAXCPU); valloc_pages(kernelstack, kstack_pages * MAXCPU); valloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_va; /* * Try to map as much as possible of kernel text and data using * 1MB section mapping and for the rest of initial kernel address * space use L2 coarse tables. * * Link L2 tables for mapping remainder of kernel (modulo 1MB) * and kernel structures */ l2_start = lastaddr & ~(L1_S_OFFSET); for (i = 0 ; i < l2size - 1; i++) pmap_link_l2pt(l1pagetable, l2_start + i * L1_S_SIZE, &kernel_pt_table[i]); pmap_curmaxkvaddr = l2_start + (l2size - 1) * L1_S_SIZE; /* Map kernel code and data */ pmap_map_chunk(l1pagetable, KERNVIRTADDR, abp->abp_physaddr, (((uint32_t)(lastaddr) - KERNVIRTADDR) + PAGE_MASK) & ~PAGE_MASK, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); /* Map L1 directory and allocated L2 page tables */ pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa, L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); pmap_map_chunk(l1pagetable, kernel_pt_table[0].pv_va, kernel_pt_table[0].pv_pa, L2_TABLE_SIZE_REAL * l2size, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); /* Map allocated DPCPU, stacks and msgbuf */ pmap_map_chunk(l1pagetable, dpcpu.pv_va, dpcpu.pv_pa, freemempos - dpcpu.pv_va, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); /* Link and map the vector page */ pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH, &kernel_pt_table[l2size - 1]); pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, PTE_CACHE); /* Establish static device mappings. */ err_devmap = platform_devmap_init(); devmap_bootstrap(l1pagetable, NULL); vm_max_kernel_address = platform_lastaddr(); cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) | DOMAIN_CLIENT); pmap_pa = kernel_l1pt.pv_pa; cpu_setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)); /* * Now that proper page tables are installed, call cpu_setup() to enable * instruction and data caches and other chip-specific features. */ cpu_setup(); /* * Only after the SOC registers block is mapped we can perform device * tree fixups, as they may attempt to read parameters from hardware. */ OF_interpret("perform-fixup", 0); platform_gpio_init(); cninit(); debugf("initarm: console initialized\n"); debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp); debugf(" boothowto = 0x%08x\n", boothowto); debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp); arm_print_kenv(); env = kern_getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } if (err_devmap != 0) printf("WARNING: could not fully configure devmap, error=%d\n", err_devmap); platform_late_init(); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ cpu_control(CPU_CONTROL_MMU_ENABLE, CPU_CONTROL_MMU_ENABLE); set_stackptrs(0); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in cpu_setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross relocations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); undefined_init(); init_proc0(kernelstack.pv_va); arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_bootstrap(freemempos, &kernel_l1pt); msgbufp = (void *)msgbufpv.pv_va; msgbufinit(msgbufp, msgbufsize); mutex_init(); /* * Exclude the kernel (and all the things we allocated which immediately * follow the kernel) from the VM allocation pool but not from crash * dumps. virtual_avail is a global variable which tracks the kva we've * "allocated" while setting up pmaps. * * Prepare the list of physical memory available to the vm subsystem. */ arm_physmem_exclude_region(abp->abp_physaddr, (virtual_avail - KERNVIRTADDR), EXFLAG_NOALLOC); arm_physmem_init_kernel_globals(); init_param2(physmem); dbg_monitor_init(); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } #else /* __ARM_ARCH < 6 */ void * initarm(struct arm_boot_params *abp) { struct mem_region mem_regions[FDT_MEM_REGIONS]; vm_paddr_t lastaddr; vm_offset_t dtbp, kernelstack, dpcpu; char *env; void *kmdp; int err_devmap, mem_regions_sz; #ifdef EFI struct efi_map_header *efihdr; #endif /* get last allocated physical address */ arm_physmem_kernaddr = abp->abp_physaddr; lastaddr = parse_boot_param(abp) - KERNVIRTADDR + arm_physmem_kernaddr; set_cpufuncs(); cpuinfo_init(); /* * Find the dtb passed in by the boot loader. */ kmdp = preload_search_by_type("elf kernel"); dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); #if defined(FDT_DTB_STATIC) /* * In case the device tree blob was not retrieved (from metadata) try * to use the statically embedded one. */ if (dtbp == (vm_offset_t)NULL) dtbp = (vm_offset_t)&fdt_static_dtb; #endif if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); #if defined(LINUX_BOOT_ABI) arm_parse_fdt_bootargs(); #endif #ifdef EFI efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr != NULL) { arm_add_efi_map_entries(efihdr, mem_regions, &mem_regions_sz); } else #endif { /* Grab physical memory regions information from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz,NULL) != 0) panic("Cannot get physical memory regions"); } arm_physmem_hardware_regions(mem_regions, mem_regions_sz); /* Grab reserved memory regions information from device tree. */ if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0) arm_physmem_exclude_regions(mem_regions, mem_regions_sz, EXFLAG_NODUMP | EXFLAG_NOALLOC); /* * Set TEX remapping registers. * Setup kernel page tables and switch to kernel L1 page table. */ pmap_set_tex(); pmap_bootstrap_prepare(lastaddr); /* * Now that proper page tables are installed, call cpu_setup() to enable * instruction and data caches and other chip-specific features. */ cpu_setup(); /* Platform-specific initialisation */ platform_probe_and_attach(); pcpu0_init(); /* Do basic tuning, hz etc */ init_param1(); /* * Allocate a page for the system page mapped to 0xffff0000 * This page will just contain the system vectors and can be * shared by all processes. */ systempage = pmap_preboot_get_pages(1); /* Map the vector page. */ pmap_preboot_map_pages(systempage, ARM_VECTORS_HIGH, 1); if (virtual_end >= ARM_VECTORS_HIGH) virtual_end = ARM_VECTORS_HIGH - 1; /* Allocate dynamic per-cpu area. */ dpcpu = pmap_preboot_get_vpages(DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate stacks for all modes */ irqstack = pmap_preboot_get_vpages(IRQ_STACK_SIZE * MAXCPU); abtstack = pmap_preboot_get_vpages(ABT_STACK_SIZE * MAXCPU); undstack = pmap_preboot_get_vpages(UND_STACK_SIZE * MAXCPU ); kernelstack = pmap_preboot_get_vpages(kstack_pages * MAXCPU); /* Allocate message buffer. */ msgbufp = (void *)pmap_preboot_get_vpages( round_page(msgbufsize) / PAGE_SIZE); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptrs(0); mutex_init(); /* Establish static device mappings. */ err_devmap = platform_devmap_init(); devmap_bootstrap(0, NULL); vm_max_kernel_address = platform_lastaddr(); /* * Only after the SOC registers block is mapped we can perform device * tree fixups, as they may attempt to read parameters from hardware. */ OF_interpret("perform-fixup", 0); platform_gpio_init(); cninit(); debugf("initarm: console initialized\n"); debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp); debugf(" boothowto = 0x%08x\n", boothowto); debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp); debugf(" lastaddr1: 0x%08x\n", lastaddr); arm_print_kenv(); env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); if (err_devmap != 0) printf("WARNING: could not fully configure devmap, error=%d\n", err_devmap); platform_late_init(); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in cpu_setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross relocations of the kernel thus * this problem will not occur after initarm(). */ /* Set stack for exception handlers */ undefined_init(); init_proc0(kernelstack); arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); enable_interrupts(PSR_A); pmap_bootstrap(0); /* Exclude the kernel (and all the things we allocated which immediately * follow the kernel) from the VM allocation pool but not from crash * dumps. virtual_avail is a global variable which tracks the kva we've * "allocated" while setting up pmaps. * * Prepare the list of physical memory available to the vm subsystem. */ arm_physmem_exclude_region(abp->abp_physaddr, pmap_preboot_get_pages(0) - abp->abp_physaddr, EXFLAG_NOALLOC); arm_physmem_init_kernel_globals(); init_param2(physmem); /* Init message buffer. */ msgbufinit(msgbufp, msgbufsize); dbg_monitor_init(); kdb_init(); return ((void *)STACKALIGN(thread0.td_pcb)); } #endif /* __ARM_ARCH < 6 */ #endif /* FDT */ Index: projects/bsd_rdma_4_9/sys/arm/cloudabi32/cloudabi32_sysvec.c =================================================================== --- projects/bsd_rdma_4_9/sys/arm/cloudabi32/cloudabi32_sysvec.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/arm/cloudabi32/cloudabi32_sysvec.c (revision 326162) @@ -1,197 +1,197 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB and the auxiliary * vector. Let r0 point to the auxiliary vector, and set * tpidrurw to the TCB. */ regs = td->td_frame; - regs->tf_r0 = td->td_retval[0] = + regs->tf_r0 = stack + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, (void *)stack); } static int cloudabi32_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int error; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_r12; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* Fetch system call arguments from registers and the stack. */ sa->args[0] = frame->tf_r0; sa->args[1] = frame->tf_r1; sa->args[2] = frame->tf_r2; sa->args[3] = frame->tf_r3; if (sa->narg > 4) { error = copyin((void *)td->td_frame->tf_usr_sp, &sa->args[4], (sa->narg - 4) * sizeof(register_t)); if (error != 0) return (error); } /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_r1; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_r0 = td->td_retval[0]; frame->tf_r1 = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* Restart system call. */ frame->tf_pc -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_r0 = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* * Initial register values for processes returning from fork. * Make sure that we only set these values when forking, not * when creating a new thread. */ if ((td->td_pflags & TDP_FORKING) != 0) { frame->tf_r0 = CLOUDABI_PROCESS_CHILD; frame->tf_r1 = td->td_tid; } } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { struct trapframe *frame; stack_t stack; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_r0 = td->td_tid; frame->tf_r1 = attr->argument; /* Set up TLS. */ return (cpu_set_user_tls(td, (void *)tcb)); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_ARM, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; Index: projects/bsd_rdma_4_9/sys/arm64/arm64/machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/arm64/arm64/machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/arm64/arm64/machdep.c (revision 326162) @@ -1,1180 +1,1179 @@ /*- * Copyright (c) 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include "opt_acpi.h" #include "opt_platform.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif #ifdef DEV_ACPI #include #include #endif #ifdef FDT #include #include #endif enum arm64_bus arm64_bus_method = ARM64_BUS_NONE; struct pcpu __pcpu[MAXCPU]; static struct trapframe proc0_tf; vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2]; vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2]; int early_boot = 1; int cold = 1; long realmem = 0; long Maxmem = 0; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t physmap[PHYSMAP_SIZE]; u_int physmap_idx; struct kva_md_info kmi; int64_t dcache_line_size; /* The minimum D cache line size */ int64_t icache_line_size; /* The minimum I cache line size */ int64_t idcache_line_size; /* The minimum cache line size */ int64_t dczva_line_size; /* The size of cache line the dc zva zeroes */ int has_pan; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* pagezero_* implementations are provided in support.S */ void pagezero_simple(void *); void pagezero_cache(void *); /* pagezero_simple is default pagezero */ void (*pagezero)(void *p) = pagezero_simple; static void pan_setup(void) { uint64_t id_aa64mfr1; id_aa64mfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); if (ID_AA64MMFR1_PAN(id_aa64mfr1) != ID_AA64MMFR1_PAN_NONE) has_pan = 1; } void pan_enable(void) { /* * The LLVM integrated assembler doesn't understand the PAN * PSTATE field. Because of this we need to manually create * the instruction in an asm block. This is equivalent to: * msr pan, #1 * * This sets the PAN bit, stopping the kernel from accessing * memory when userspace can also access it unless the kernel * uses the userspace load/store instructions. */ if (has_pan) { WRITE_SPECIALREG(sctlr_el1, READ_SPECIALREG(sctlr_el1) & ~SCTLR_SPAN); __asm __volatile(".inst 0xd500409f | (0x1 << 8)"); } } static void cpu_startup(void *dummy) { undef_init(); identify_cpu(); vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); int cpu_idle_wakeup(int cpu) { return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; regs->sp = frame->tf_sp; regs->lr = frame->tf_lr; regs->elr = frame->tf_elr; regs->spsr = frame->tf_spsr; memcpy(regs->x, frame->tf_x, sizeof(regs->x)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; frame->tf_sp = regs->sp; frame->tf_lr = regs->lr; frame->tf_elr = regs->elr; - frame->tf_spsr = regs->spsr; + frame->tf_spsr &= ~PSR_FLAGS; + frame->tf_spsr |= regs->spsr & PSR_FLAGS; memcpy(frame->tf_x, regs->x, sizeof(frame->tf_x)); return (0); } int fill_fpregs(struct thread *td, struct fpreg *regs) { #ifdef VFP struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) { /* * If we have just been running VFP instructions we will * need to save the state to memcpy it below. */ if (td == curthread) vfp_save_state(td, pcb); KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate, ("Called fill_fpregs while the kernel is using the VFP")); memcpy(regs->fp_q, pcb->pcb_fpustate.vfp_regs, sizeof(regs->fp_q)); regs->fp_cr = pcb->pcb_fpustate.vfp_fpcr; regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr; } else #endif memset(regs->fp_q, 0, sizeof(regs->fp_q)); return (0); } int set_fpregs(struct thread *td, struct fpreg *regs) { #ifdef VFP struct pcb *pcb; pcb = td->td_pcb; KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate, ("Called set_fpregs while the kernel is using the VFP")); memcpy(pcb->pcb_fpustate.vfp_regs, regs->fp_q, sizeof(regs->fp_q)); pcb->pcb_fpustate.vfp_fpcr = regs->fp_cr; pcb->pcb_fpustate.vfp_fpsr = regs->fp_sr; #endif return (0); } int fill_dbregs(struct thread *td, struct dbreg *regs) { printf("ARM64TODO: fill_dbregs"); return (EDOOFUS); } int set_dbregs(struct thread *td, struct dbreg *regs) { printf("ARM64TODO: set_dbregs"); return (EDOOFUS); } int ptrace_set_pc(struct thread *td, u_long addr) { printf("ARM64TODO: ptrace_set_pc"); return (EDOOFUS); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_spsr |= PSR_SS; td->td_pcb->pcb_flags |= PCB_SINGLE_STEP; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_spsr &= ~PSR_SS; td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP; return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf = td->td_frame; memset(tf, 0, sizeof(struct trapframe)); - /* - * We need to set x0 for init as it doesn't call - * cpu_set_syscall_retval to copy the value. We also - * need to set td_retval for the cases where we do. - */ - tf->tf_x[0] = td->td_retval[0] = stack; + tf->tf_x[0] = stack; tf->tf_sp = STACKALIGN(stack); tf->tf_lr = imgp->entry_addr; tf->tf_elr = imgp->entry_addr; } /* Sanity check these are the same size, they will be memcpy'd to and fro */ CTASSERT(sizeof(((struct trapframe *)0)->tf_x) == sizeof((struct gpregs *)0)->gp_x); CTASSERT(sizeof(((struct trapframe *)0)->tf_x) == sizeof((struct reg *)0)->x); int get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret) { struct trapframe *tf = td->td_frame; if (clear_ret & GET_MC_CLEAR_RET) { mcp->mc_gpregs.gp_x[0] = 0; mcp->mc_gpregs.gp_spsr = tf->tf_spsr & ~PSR_C; } else { mcp->mc_gpregs.gp_x[0] = tf->tf_x[0]; mcp->mc_gpregs.gp_spsr = tf->tf_spsr; } memcpy(&mcp->mc_gpregs.gp_x[1], &tf->tf_x[1], sizeof(mcp->mc_gpregs.gp_x[1]) * (nitems(mcp->mc_gpregs.gp_x) - 1)); mcp->mc_gpregs.gp_sp = tf->tf_sp; mcp->mc_gpregs.gp_lr = tf->tf_lr; mcp->mc_gpregs.gp_elr = tf->tf_elr; return (0); } int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tf = td->td_frame; + uint32_t spsr; + spsr = mcp->mc_gpregs.gp_spsr; + if ((spsr & PSR_M_MASK) != PSR_M_EL0t || + (spsr & (PSR_F | PSR_I | PSR_A | PSR_D)) != 0) + return (EINVAL); + memcpy(tf->tf_x, mcp->mc_gpregs.gp_x, sizeof(tf->tf_x)); tf->tf_sp = mcp->mc_gpregs.gp_sp; tf->tf_lr = mcp->mc_gpregs.gp_lr; tf->tf_elr = mcp->mc_gpregs.gp_elr; tf->tf_spsr = mcp->mc_gpregs.gp_spsr; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifdef VFP struct pcb *curpcb; critical_enter(); curpcb = curthread->td_pcb; if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) { /* * If we have just been running VFP instructions we will * need to save the state to memcpy it below. */ vfp_save_state(td, curpcb); KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate, ("Called get_fpcontext while the kernel is using the VFP")); KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0, ("Non-userspace FPU flags set in get_fpcontext")); memcpy(mcp->mc_fpregs.fp_q, curpcb->pcb_fpustate.vfp_regs, sizeof(mcp->mc_fpregs)); mcp->mc_fpregs.fp_cr = curpcb->pcb_fpustate.vfp_fpcr; mcp->mc_fpregs.fp_sr = curpcb->pcb_fpustate.vfp_fpsr; mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags; mcp->mc_flags |= _MC_FP_VALID; } critical_exit(); #endif } static void set_fpcontext(struct thread *td, mcontext_t *mcp) { #ifdef VFP struct pcb *curpcb; critical_enter(); if ((mcp->mc_flags & _MC_FP_VALID) != 0) { curpcb = curthread->td_pcb; /* * Discard any vfp state for the current thread, we * are about to override it. */ vfp_discard(td); KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate, ("Called set_fpcontext while the kernel is using the VFP")); memcpy(curpcb->pcb_fpustate.vfp_regs, mcp->mc_fpregs.fp_q, sizeof(mcp->mc_fpregs)); curpcb->pcb_fpustate.vfp_fpcr = mcp->mc_fpregs.fp_cr; curpcb->pcb_fpustate.vfp_fpsr = mcp->mc_fpregs.fp_sr; curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK; } critical_exit(); #endif } void cpu_idle(int busy) { spinlock_enter(); if (!busy) cpu_idleclock(); if (!sched_runnable()) __asm __volatile( "dsb sy \n" "wfi \n"); if (!busy) cpu_activeclock(); spinlock_exit(); } void cpu_halt(void) { /* We should have shutdown by now, if not enter a low power sleep */ intr_disable(); while (1) { __asm __volatile("wfi"); } } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* ARM64TODO TBD */ } /* Get current clock frequency for the given CPU ID. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { struct pcpu *pc; pc = pcpu_find(cpu_id); if (pc == NULL || rate == NULL) return (EINVAL); if (pc->pc_clock == 0) return (EOPNOTSUPP); *rate = pc->pc_clock; return (0); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } void spinlock_enter(void) { struct thread *td; register_t daif; td = curthread; if (td->td_md.md_spinlock_count == 0) { daif = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_daif = daif; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t daif; td = curthread; critical_exit(); daif = td->td_md.md_saved_daif; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(daif); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { ucontext_t uc; - uint32_t spsr; + int error; if (uap == NULL) return (EFAULT); if (copyin(uap->sigcntxp, &uc, sizeof(uc))) return (EFAULT); - spsr = uc.uc_mcontext.mc_gpregs.gp_spsr; - if ((spsr & PSR_M_MASK) != PSR_M_EL0t || - (spsr & (PSR_F | PSR_I | PSR_A | PSR_D)) != 0) - return (EINVAL); - - set_mcontext(td, &uc.uc_mcontext); + error = set_mcontext(td, &uc.uc_mcontext); + if (error != 0) + return (error); set_fpcontext(td, &uc.uc_mcontext); /* Restore signal mask. */ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { int i; for (i = 0; i < PCB_LR; i++) pcb->pcb_x[i] = tf->tf_x[i]; pcb->pcb_x[PCB_LR] = tf->tf_lr; pcb->pcb_pc = tf->tf_elr; pcb->pcb_sp = tf->tf_sp; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td; struct proc *p; struct trapframe *tf; struct sigframe *fp, frame; struct sigacts *psp; struct sysentvec *sysent; int code, onstack, sig; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else { fp = (struct sigframe *)td->td_frame->tf_sp; } /* Make room, keeping the stack aligned */ fp--; fp = (struct sigframe *)STACKALIGN(fp); /* Fill in the frame to copy out */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); get_fpcontext(td, &frame.sf_uc.uc_mcontext); frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } tf->tf_x[0]= sig; tf->tf_x[1] = (register_t)&fp->sf_si; tf->tf_x[2] = (register_t)&fp->sf_uc; tf->tf_elr = (register_t)catcher; tf->tf_sp = (register_t)fp; sysent = p->p_sysent; if (sysent->sv_sigcode_base != 0) tf->tf_lr = (register_t)sysent->sv_sigcode_base; else tf->tf_lr = (register_t)(sysent->sv_psstrings - *(sysent->sv_szsigcode)); CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } static void init_proc0(vm_offset_t kstack) { struct pcpu *pcpup = &__pcpu[0]; proc_linkup0(&proc0, &thread0); thread0.td_kstack = kstack; thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1; thread0.td_pcb->pcb_fpflags = 0; thread0.td_pcb->pcb_fpusaved = &thread0.td_pcb->pcb_fpustate; thread0.td_pcb->pcb_vfpcpu = UINT_MAX; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; } typedef struct { uint32_t type; uint64_t phys_start; uint64_t virt_start; uint64_t num_pages; uint64_t attr; } EFI_MEMORY_DESCRIPTOR; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, u_int *physmap_idxp) { u_int i, insert_idx, _physmap_idx; _physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = _physmap_idx; for (i = 0; i <= _physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= _physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } _physmap_idx += 2; *physmap_idxp = _physmap_idx; if (_physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = _physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } #ifdef FDT static void add_fdt_mem_regions(struct mem_region *mr, int mrcnt, vm_paddr_t *physmap, u_int *physmap_idxp) { for (int i = 0; i < mrcnt; i++) { if (!add_physmap_entry(mr[i].mr_start, mr[i].mr_size, physmap, physmap_idxp)) break; } } #endif static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, u_int *physmap_idxp) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idxp)) break; } } #ifdef FDT static void try_load_dtb(caddr_t kmdp) { vm_offset_t dtbp; dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); if (dtbp == (vm_offset_t)NULL) { printf("ERROR loading DTB\n"); return; } if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); } #endif static bool bus_probe(void) { bool has_acpi, has_fdt; char *order, *env; has_acpi = has_fdt = false; #ifdef FDT has_fdt = (OF_peer(0) != 0); #endif #ifdef DEV_ACPI has_acpi = (acpi_find_table(ACPI_SIG_SPCR) != 0); #endif env = kern_getenv("kern.cfg.order"); if (env != NULL) { order = env; while (order != NULL) { if (has_acpi && strncmp(order, "acpi", 4) == 0 && (order[4] == ',' || order[4] == '\0')) { arm64_bus_method = ARM64_BUS_ACPI; break; } if (has_fdt && strncmp(order, "fdt", 3) == 0 && (order[3] == ',' || order[3] == '\0')) { arm64_bus_method = ARM64_BUS_FDT; break; } order = strchr(order, ','); } freeenv(env); /* If we set the bus method it is valid */ if (arm64_bus_method != ARM64_BUS_NONE) return (true); } /* If no order or an invalid order was set use the default */ if (arm64_bus_method == ARM64_BUS_NONE) { if (has_fdt) arm64_bus_method = ARM64_BUS_FDT; else if (has_acpi) arm64_bus_method = ARM64_BUS_ACPI; } /* * If no option was set the default is valid, otherwise we are * setting one to get cninit() working, then calling panic to tell * the user about the invalid bus setup. */ return (env == NULL); } static void cache_setup(void) { int dcache_line_shift, icache_line_shift, dczva_line_shift; uint32_t ctr_el0; uint32_t dczid_el0; ctr_el0 = READ_SPECIALREG(ctr_el0); /* Read the log2 words in each D cache line */ dcache_line_shift = CTR_DLINE_SIZE(ctr_el0); /* Get the D cache line size */ dcache_line_size = sizeof(int) << dcache_line_shift; /* And the same for the I cache */ icache_line_shift = CTR_ILINE_SIZE(ctr_el0); icache_line_size = sizeof(int) << icache_line_shift; idcache_line_size = MIN(dcache_line_size, icache_line_size); dczid_el0 = READ_SPECIALREG(dczid_el0); /* Check if dc zva is not prohibited */ if (dczid_el0 & DCZID_DZP) dczva_line_size = 0; else { /* Same as with above calculations */ dczva_line_shift = DCZID_BS_SIZE(dczid_el0); dczva_line_size = sizeof(int) << dczva_line_shift; /* Change pagezero function */ pagezero = pagezero_cache; } } void initarm(struct arm64_bootparams *abp) { struct efi_map_header *efihdr; struct pcpu *pcpup; #ifdef FDT struct mem_region mem_regions[FDT_MEM_REGIONS]; int mem_regions_sz; #endif vm_offset_t lastaddr; caddr_t kmdp; vm_paddr_t mem_len; bool valid; int i; /* Set the module data location */ preload_metadata = (caddr_t)(uintptr_t)(abp->modulep); /* Find the kernel address */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); #ifdef FDT try_load_dtb(kmdp); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); /* Find the address to start allocating from */ lastaddr = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); /* Load the physical memory ranges */ physmap_idx = 0; efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr != NULL) add_efi_map_entries(efihdr, physmap, &physmap_idx); #ifdef FDT else { /* Grab physical memory regions information from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, NULL) != 0) panic("Cannot get physical memory regions"); add_fdt_mem_regions(mem_regions, mem_regions_sz, physmap, &physmap_idx); } #endif /* Print the memory map */ mem_len = 0; for (i = 0; i < physmap_idx; i += 2) { dump_avail[i] = physmap[i]; dump_avail[i + 1] = physmap[i + 1]; mem_len += physmap[i + 1] - physmap[i]; } dump_avail[i] = 0; dump_avail[i + 1] = 0; /* Set the pcpu data, this is needed by pmap_bootstrap */ pcpup = &__pcpu[0]; pcpu_init(pcpup, 0, sizeof(struct pcpu)); /* * Set the pcpu pointer with a backup in tpidr_el1 to be * loaded when entering the kernel from userland. */ __asm __volatile( "mov x18, %0 \n" "msr tpidr_el1, %0" :: "r"(pcpup)); PCPU_SET(curthread, &thread0); /* Do basic tuning, hz etc */ init_param1(); cache_setup(); pan_setup(); /* Bootstrap enough of pmap to enter the kernel proper */ pmap_bootstrap(abp->kern_l0pt, abp->kern_l1pt, KERNBASE - abp->kern_delta, lastaddr - KERNBASE); devmap_bootstrap(0, NULL); valid = bus_probe(); cninit(); if (!valid) panic("Invalid bus configuration: %s", kern_getenv("kern.cfg.order")); init_proc0(abp->kern_stack); msgbufinit(msgbufp, msgbufsize); mutex_init(); init_param2(physmem); dbg_init(); kdb_init(); pan_enable(); early_boot = 0; } void dbg_init(void) { /* Clear OS lock */ WRITE_SPECIALREG(OSLAR_EL1, 0); /* This permits DDB to use debug registers for watchpoints. */ dbg_monitor_init(); /* TODO: Eventually will need to initialize debug registers here. */ } #ifdef DDB #include DB_SHOW_COMMAND(specialregs, db_show_spregs) { #define PRINT_REG(reg) \ db_printf(__STRING(reg) " = %#016lx\n", READ_SPECIALREG(reg)) PRINT_REG(actlr_el1); PRINT_REG(afsr0_el1); PRINT_REG(afsr1_el1); PRINT_REG(aidr_el1); PRINT_REG(amair_el1); PRINT_REG(ccsidr_el1); PRINT_REG(clidr_el1); PRINT_REG(contextidr_el1); PRINT_REG(cpacr_el1); PRINT_REG(csselr_el1); PRINT_REG(ctr_el0); PRINT_REG(currentel); PRINT_REG(daif); PRINT_REG(dczid_el0); PRINT_REG(elr_el1); PRINT_REG(esr_el1); PRINT_REG(far_el1); #if 0 /* ARM64TODO: Enable VFP before reading floating-point registers */ PRINT_REG(fpcr); PRINT_REG(fpsr); #endif PRINT_REG(id_aa64afr0_el1); PRINT_REG(id_aa64afr1_el1); PRINT_REG(id_aa64dfr0_el1); PRINT_REG(id_aa64dfr1_el1); PRINT_REG(id_aa64isar0_el1); PRINT_REG(id_aa64isar1_el1); PRINT_REG(id_aa64pfr0_el1); PRINT_REG(id_aa64pfr1_el1); PRINT_REG(id_afr0_el1); PRINT_REG(id_dfr0_el1); PRINT_REG(id_isar0_el1); PRINT_REG(id_isar1_el1); PRINT_REG(id_isar2_el1); PRINT_REG(id_isar3_el1); PRINT_REG(id_isar4_el1); PRINT_REG(id_isar5_el1); PRINT_REG(id_mmfr0_el1); PRINT_REG(id_mmfr1_el1); PRINT_REG(id_mmfr2_el1); PRINT_REG(id_mmfr3_el1); #if 0 /* Missing from llvm */ PRINT_REG(id_mmfr4_el1); #endif PRINT_REG(id_pfr0_el1); PRINT_REG(id_pfr1_el1); PRINT_REG(isr_el1); PRINT_REG(mair_el1); PRINT_REG(midr_el1); PRINT_REG(mpidr_el1); PRINT_REG(mvfr0_el1); PRINT_REG(mvfr1_el1); PRINT_REG(mvfr2_el1); PRINT_REG(revidr_el1); PRINT_REG(sctlr_el1); PRINT_REG(sp_el0); PRINT_REG(spsel); PRINT_REG(spsr_el1); PRINT_REG(tcr_el1); PRINT_REG(tpidr_el0); PRINT_REG(tpidr_el1); PRINT_REG(tpidrro_el0); PRINT_REG(ttbr0_el1); PRINT_REG(ttbr1_el1); PRINT_REG(vbar_el1); #undef PRINT_REG } DB_SHOW_COMMAND(vtop, db_show_vtop) { uint64_t phys; if (have_addr) { phys = arm64_address_translate_s1e1r(addr); db_printf("EL1 physical address reg (read): 0x%016lx\n", phys); phys = arm64_address_translate_s1e1w(addr); db_printf("EL1 physical address reg (write): 0x%016lx\n", phys); phys = arm64_address_translate_s1e0r(addr); db_printf("EL0 physical address reg (read): 0x%016lx\n", phys); phys = arm64_address_translate_s1e0w(addr); db_printf("EL0 physical address reg (write): 0x%016lx\n", phys); } else db_printf("show vtop \n"); } #endif Index: projects/bsd_rdma_4_9/sys/arm64/cloudabi64/cloudabi64_sysvec.c =================================================================== --- projects/bsd_rdma_4_9/sys/arm64/cloudabi64/cloudabi64_sysvec.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/arm64/cloudabi64/cloudabi64_sysvec.c (revision 326162) @@ -1,189 +1,189 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi64_syscallnames[]; extern struct sysent cloudabi64_sysent[]; static void cloudabi64_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB and the auxiliary * vector. Let x0 point to the auxiliary vector, and set * tpidr_el0 to the TCB. */ regs = td->td_frame; - regs->tf_x[0] = td->td_retval[0] = + regs->tf_x[0] = stack + roundup(sizeof(cloudabi64_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, (void *)stack); } static int cloudabi64_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int i; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_x[8]; if (sa->code >= CLOUDABI64_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi64_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* Fetch system call arguments. */ for (i = 0; i < MAXARGS; i++) sa->args[i] = frame->tf_x[i]; /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_x[1]; return (0); } static void cloudabi64_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_x[0] = td->td_retval[0]; frame->tf_x[1] = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* Restart system call. */ frame->tf_elr -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_x[0] = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi64_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* * Initial register values for processes returning from fork. * Make sure that we only set these values when forking, not * when creating a new thread. */ if ((td->td_pflags & TDP_FORKING) != 0) { frame->tf_x[0] = CLOUDABI_PROCESS_CHILD; frame->tf_x[1] = td->td_tid; } } int cloudabi64_thread_setregs(struct thread *td, const cloudabi64_threadattr_t *attr, uint64_t tcb) { struct trapframe *frame; stack_t stack; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_x[0] = td->td_tid; frame->tf_x[1] = attr->argument; /* Set up TLS. */ return (cpu_set_user_tls(td, (void *)tcb)); } static struct sysentvec cloudabi64_elf_sysvec = { .sv_size = CLOUDABI64_SYS_MAXSYSCALL, .sv_table = cloudabi64_sysent, .sv_fixup = cloudabi64_fixup, .sv_name = "CloudABI ELF64", .sv_coredump = elf64_coredump, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi64_copyout_strings, .sv_setregs = cloudabi64_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_LP64, .sv_set_syscall_retval = cloudabi64_set_syscall_retval, .sv_fetch_syscall_args = cloudabi64_fetch_syscall_args, .sv_syscallnames = cloudabi64_syscallnames, .sv_schedtail = cloudabi64_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi64_elf_sysvec); Elf64_Brandinfo cloudabi64_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_AARCH64, .sysvec = &cloudabi64_elf_sysvec, .flags = BI_CAN_EXEC_DYN | BI_BRAND_ONLY_STATIC, }; Index: projects/bsd_rdma_4_9/sys/arm64/include/armreg.h =================================================================== --- projects/bsd_rdma_4_9/sys/arm64/include/armreg.h (revision 326161) +++ projects/bsd_rdma_4_9/sys/arm64/include/armreg.h (revision 326162) @@ -1,646 +1,647 @@ /*- * Copyright (c) 2013, 2014 Andrew Turner * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_ARMREG_H_ #define _MACHINE_ARMREG_H_ #define INSN_SIZE 4 #define READ_SPECIALREG(reg) \ ({ uint64_t val; \ __asm __volatile("mrs %0, " __STRING(reg) : "=&r" (val)); \ val; \ }) #define WRITE_SPECIALREG(reg, val) \ __asm __volatile("msr " __STRING(reg) ", %0" : : "r"((uint64_t)val)) /* CNTHCTL_EL2 - Counter-timer Hypervisor Control register */ #define CNTHCTL_EVNTI_MASK (0xf << 4) /* Bit to trigger event stream */ #define CNTHCTL_EVNTDIR (1 << 3) /* Control transition trigger bit */ #define CNTHCTL_EVNTEN (1 << 2) /* Enable event stream */ #define CNTHCTL_EL1PCEN (1 << 1) /* Allow EL0/1 physical timer access */ #define CNTHCTL_EL1PCTEN (1 << 0) /*Allow EL0/1 physical counter access*/ /* CPACR_EL1 */ #define CPACR_FPEN_MASK (0x3 << 20) #define CPACR_FPEN_TRAP_ALL1 (0x0 << 20) /* Traps from EL0 and EL1 */ #define CPACR_FPEN_TRAP_EL0 (0x1 << 20) /* Traps from EL0 */ #define CPACR_FPEN_TRAP_ALL2 (0x2 << 20) /* Traps from EL0 and EL1 */ #define CPACR_FPEN_TRAP_NONE (0x3 << 20) /* No traps */ #define CPACR_TTA (0x1 << 28) /* CTR_EL0 - Cache Type Register */ #define CTR_DLINE_SHIFT 16 #define CTR_DLINE_MASK (0xf << CTR_DLINE_SHIFT) #define CTR_DLINE_SIZE(reg) (((reg) & CTR_DLINE_MASK) >> CTR_DLINE_SHIFT) #define CTR_ILINE_SHIFT 0 #define CTR_ILINE_MASK (0xf << CTR_ILINE_SHIFT) #define CTR_ILINE_SIZE(reg) (((reg) & CTR_ILINE_MASK) >> CTR_ILINE_SHIFT) /* DCZID_EL0 - Data Cache Zero ID register */ #define DCZID_DZP (1 << 4) /* DC ZVA prohibited if non-0 */ #define DCZID_BS_SHIFT 0 #define DCZID_BS_MASK (0xf << DCZID_BS_SHIFT) #define DCZID_BS_SIZE(reg) (((reg) & DCZID_BS_MASK) >> DCZID_BS_SHIFT) /* ESR_ELx */ #define ESR_ELx_ISS_MASK 0x00ffffff #define ISS_INSN_FnV (0x01 << 10) #define ISS_INSN_EA (0x01 << 9) #define ISS_INSN_S1PTW (0x01 << 7) #define ISS_INSN_IFSC_MASK (0x1f << 0) #define ISS_DATA_ISV (0x01 << 24) #define ISS_DATA_SAS_MASK (0x03 << 22) #define ISS_DATA_SSE (0x01 << 21) #define ISS_DATA_SRT_MASK (0x1f << 16) #define ISS_DATA_SF (0x01 << 15) #define ISS_DATA_AR (0x01 << 14) #define ISS_DATA_FnV (0x01 << 10) #define ISS_DATa_EA (0x01 << 9) #define ISS_DATa_CM (0x01 << 8) #define ISS_INSN_S1PTW (0x01 << 7) #define ISS_DATa_WnR (0x01 << 6) #define ISS_DATA_DFSC_MASK (0x3f << 0) #define ISS_DATA_DFSC_ASF_L0 (0x00 << 0) #define ISS_DATA_DFSC_ASF_L1 (0x01 << 0) #define ISS_DATA_DFSC_ASF_L2 (0x02 << 0) #define ISS_DATA_DFSC_ASF_L3 (0x03 << 0) #define ISS_DATA_DFSC_TF_L0 (0x04 << 0) #define ISS_DATA_DFSC_TF_L1 (0x05 << 0) #define ISS_DATA_DFSC_TF_L2 (0x06 << 0) #define ISS_DATA_DFSC_TF_L3 (0x07 << 0) #define ISS_DATA_DFSC_AFF_L1 (0x09 << 0) #define ISS_DATA_DFSC_AFF_L2 (0x0a << 0) #define ISS_DATA_DFSC_AFF_L3 (0x0b << 0) #define ISS_DATA_DFSC_PF_L1 (0x0d << 0) #define ISS_DATA_DFSC_PF_L2 (0x0e << 0) #define ISS_DATA_DFSC_PF_L3 (0x0f << 0) #define ISS_DATA_DFSC_EXT (0x10 << 0) #define ISS_DATA_DFSC_EXT_L0 (0x14 << 0) #define ISS_DATA_DFSC_EXT_L1 (0x15 << 0) #define ISS_DATA_DFSC_EXT_L2 (0x16 << 0) #define ISS_DATA_DFSC_EXT_L3 (0x17 << 0) #define ISS_DATA_DFSC_ECC (0x18 << 0) #define ISS_DATA_DFSC_ECC_L0 (0x1c << 0) #define ISS_DATA_DFSC_ECC_L1 (0x1d << 0) #define ISS_DATA_DFSC_ECC_L2 (0x1e << 0) #define ISS_DATA_DFSC_ECC_L3 (0x1f << 0) #define ISS_DATA_DFSC_ALIGN (0x21 << 0) #define ISS_DATA_DFSC_TLB_CONFLICT (0x30 << 0) #define ESR_ELx_IL (0x01 << 25) #define ESR_ELx_EC_SHIFT 26 #define ESR_ELx_EC_MASK (0x3f << 26) #define ESR_ELx_EXCEPTION(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define EXCP_UNKNOWN 0x00 /* Unkwn exception */ #define EXCP_FP_SIMD 0x07 /* VFP/SIMD trap */ #define EXCP_ILL_STATE 0x0e /* Illegal execution state */ #define EXCP_SVC 0x15 /* SVC trap */ #define EXCP_MSR 0x18 /* MSR/MRS trap */ #define EXCP_INSN_ABORT_L 0x20 /* Instruction abort, from lower EL */ #define EXCP_INSN_ABORT 0x21 /* Instruction abort, from same EL */ #define EXCP_PC_ALIGN 0x22 /* PC alignment fault */ #define EXCP_DATA_ABORT_L 0x24 /* Data abort, from lower EL */ #define EXCP_DATA_ABORT 0x25 /* Data abort, from same EL */ #define EXCP_SP_ALIGN 0x26 /* SP slignment fault */ #define EXCP_TRAP_FP 0x2c /* Trapped FP exception */ #define EXCP_SERROR 0x2f /* SError interrupt */ #define EXCP_SOFTSTP_EL0 0x32 /* Software Step, from lower EL */ #define EXCP_SOFTSTP_EL1 0x33 /* Software Step, from same EL */ #define EXCP_WATCHPT_EL1 0x35 /* Watchpoint, from same EL */ #define EXCP_BRK 0x3c /* Breakpoint */ /* ICC_CTLR_EL1 */ #define ICC_CTLR_EL1_EOIMODE (1U << 1) /* ICC_IAR1_EL1 */ #define ICC_IAR1_EL1_SPUR (0x03ff) /* ICC_IGRPEN0_EL1 */ #define ICC_IGRPEN0_EL1_EN (1U << 0) /* ICC_PMR_EL1 */ #define ICC_PMR_EL1_PRIO_MASK (0xFFUL) /* ICC_SGI1R_EL1 */ #define ICC_SGI1R_EL1_TL_MASK 0xffffUL #define ICC_SGI1R_EL1_AFF1_SHIFT 16 #define ICC_SGI1R_EL1_SGIID_SHIFT 24 #define ICC_SGI1R_EL1_AFF2_SHIFT 32 #define ICC_SGI1R_EL1_AFF3_SHIFT 48 #define ICC_SGI1R_EL1_SGIID_MASK 0xfUL #define ICC_SGI1R_EL1_IRM (0x1UL << 40) /* ICC_SRE_EL1 */ #define ICC_SRE_EL1_SRE (1U << 0) /* ICC_SRE_EL2 */ #define ICC_SRE_EL2_SRE (1U << 0) #define ICC_SRE_EL2_EN (1U << 3) /* ID_AA64DFR0_EL1 */ #define ID_AA64DFR0_MASK 0x0000000ff0f0fffful #define ID_AA64DFR0_DEBUG_VER_SHIFT 0 #define ID_AA64DFR0_DEBUG_VER_MASK (0xf << ID_AA64DFR0_DEBUG_VER_SHIFT) #define ID_AA64DFR0_DEBUG_VER(x) ((x) & ID_AA64DFR0_DEBUG_VER_MASK) #define ID_AA64DFR0_DEBUG_VER_8 (0x6 << ID_AA64DFR0_DEBUG_VER_SHIFT) #define ID_AA64DFR0_DEBUG_VER_8_VHE (0x7 << ID_AA64DFR0_DEBUG_VER_SHIFT) #define ID_AA64DFR0_DEBUG_VER_8_2 (0x8 << ID_AA64DFR0_DEBUG_VER_SHIFT) #define ID_AA64DFR0_TRACE_VER_SHIFT 4 #define ID_AA64DFR0_TRACE_VER_MASK (0xf << ID_AA64DFR0_TRACE_VER_SHIFT) #define ID_AA64DFR0_TRACE_VER(x) ((x) & ID_AA64DFR0_TRACE_VER_MASK) #define ID_AA64DFR0_TRACE_VER_NONE (0x0 << ID_AA64DFR0_TRACE_VER_SHIFT) #define ID_AA64DFR0_TRACE_VER_IMPL (0x1 << ID_AA64DFR0_TRACE_VER_SHIFT) #define ID_AA64DFR0_PMU_VER_SHIFT 8 #define ID_AA64DFR0_PMU_VER_MASK (0xf << ID_AA64DFR0_PMU_VER_SHIFT) #define ID_AA64DFR0_PMU_VER(x) ((x) & ID_AA64DFR0_PMU_VER_MASK) #define ID_AA64DFR0_PMU_VER_NONE (0x0 << ID_AA64DFR0_PMU_VER_SHIFT) #define ID_AA64DFR0_PMU_VER_3 (0x1 << ID_AA64DFR0_PMU_VER_SHIFT) #define ID_AA64DFR0_PMU_VER_3_1 (0x4 << ID_AA64DFR0_PMU_VER_SHIFT) #define ID_AA64DFR0_PMU_VER_IMPL (0xf << ID_AA64DFR0_PMU_VER_SHIFT) #define ID_AA64DFR0_BRPS_SHIFT 12 #define ID_AA64DFR0_BRPS_MASK (0xf << ID_AA64DFR0_BRPS_SHIFT) #define ID_AA64DFR0_BRPS(x) \ ((((x) >> ID_AA64DFR0_BRPS_SHIFT) & 0xf) + 1) #define ID_AA64DFR0_WRPS_SHIFT 20 #define ID_AA64DFR0_WRPS_MASK (0xf << ID_AA64DFR0_WRPS_SHIFT) #define ID_AA64DFR0_WRPS(x) \ ((((x) >> ID_AA64DFR0_WRPS_SHIFT) & 0xf) + 1) #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_CTX_CMPS_MASK (0xf << ID_AA64DFR0_CTX_CMPS_SHIFT) #define ID_AA64DFR0_CTX_CMPS(x) \ ((((x) >> ID_AA64DFR0_CTX_CMPS_SHIFT) & 0xf) + 1) #define ID_AA64DFR0_PMS_VER_SHIFT 32 #define ID_AA64DFR0_PMS_VER_MASK (0xful << ID_AA64DFR0_PMS_VER_SHIFT) #define ID_AA64DFR0_PMS_VER(x) ((x) & ID_AA64DFR0_PMS_VER_MASK) #define ID_AA64DFR0_PMS_VER_NONE (0x0ul << ID_AA64DFR0_PMS_VER_SHIFT) #define ID_AA64DFR0_PMS_VER_V1 (0x1ul << ID_AA64DFR0_PMS_VER_SHIFT) /* ID_AA64ISAR0_EL1 */ #define ID_AA64ISAR0_MASK 0x0000fffff0fffff0ul #define ID_AA64ISAR0_AES_SHIFT 4 #define ID_AA64ISAR0_AES_MASK (0xf << ID_AA64ISAR0_AES_SHIFT) #define ID_AA64ISAR0_AES(x) ((x) & ID_AA64ISAR0_AES_MASK) #define ID_AA64ISAR0_AES_NONE (0x0 << ID_AA64ISAR0_AES_SHIFT) #define ID_AA64ISAR0_AES_BASE (0x1 << ID_AA64ISAR0_AES_SHIFT) #define ID_AA64ISAR0_AES_PMULL (0x2 << ID_AA64ISAR0_AES_SHIFT) #define ID_AA64ISAR0_SHA1_SHIFT 8 #define ID_AA64ISAR0_SHA1_MASK (0xf << ID_AA64ISAR0_SHA1_SHIFT) #define ID_AA64ISAR0_SHA1(x) ((x) & ID_AA64ISAR0_SHA1_MASK) #define ID_AA64ISAR0_SHA1_NONE (0x0 << ID_AA64ISAR0_SHA1_SHIFT) #define ID_AA64ISAR0_SHA1_BASE (0x1 << ID_AA64ISAR0_SHA1_SHIFT) #define ID_AA64ISAR0_SHA2_SHIFT 12 #define ID_AA64ISAR0_SHA2_MASK (0xf << ID_AA64ISAR0_SHA2_SHIFT) #define ID_AA64ISAR0_SHA2(x) ((x) & ID_AA64ISAR0_SHA2_MASK) #define ID_AA64ISAR0_SHA2_NONE (0x0 << ID_AA64ISAR0_SHA2_SHIFT) #define ID_AA64ISAR0_SHA2_BASE (0x1 << ID_AA64ISAR0_SHA2_SHIFT) #define ID_AA64ISAR0_SHA2_512 (0x2 << ID_AA64ISAR0_SHA2_SHIFT) #define ID_AA64ISAR0_CRC32_SHIFT 16 #define ID_AA64ISAR0_CRC32_MASK (0xf << ID_AA64ISAR0_CRC32_SHIFT) #define ID_AA64ISAR0_CRC32(x) ((x) & ID_AA64ISAR0_CRC32_MASK) #define ID_AA64ISAR0_CRC32_NONE (0x0 << ID_AA64ISAR0_CRC32_SHIFT) #define ID_AA64ISAR0_CRC32_BASE (0x1 << ID_AA64ISAR0_CRC32_SHIFT) #define ID_AA64ISAR0_ATOMIC_SHIFT 20 #define ID_AA64ISAR0_ATOMIC_MASK (0xf << ID_AA64ISAR0_ATOMIC_SHIFT) #define ID_AA64ISAR0_ATOMIC(x) ((x) & ID_AA64ISAR0_ATOMIC_MASK) #define ID_AA64ISAR0_ATOMIC_NONE (0x0 << ID_AA64ISAR0_ATOMIC_SHIFT) #define ID_AA64ISAR0_ATOMIC_IMPL (0x2 << ID_AA64ISAR0_ATOMIC_SHIFT) #define ID_AA64ISAR0_RDM_SHIFT 28 #define ID_AA64ISAR0_RDM_MASK (0xf << ID_AA64ISAR0_RDM_SHIFT) #define ID_AA64ISAR0_RDM(x) ((x) & ID_AA64ISAR0_RDM_MASK) #define ID_AA64ISAR0_RDM_NONE (0x0 << ID_AA64ISAR0_RDM_SHIFT) #define ID_AA64ISAR0_RDM_IMPL (0x1 << ID_AA64ISAR0_RDM_SHIFT) #define ID_AA64ISAR0_SHA3_SHIFT 32 #define ID_AA64ISAR0_SHA3_MASK (0xful << ID_AA64ISAR0_SHA3_SHIFT) #define ID_AA64ISAR0_SHA3(x) ((x) & ID_AA64ISAR0_SHA3_MASK) #define ID_AA64ISAR0_SHA3_NONE (0x0ul << ID_AA64ISAR0_SHA3_SHIFT) #define ID_AA64ISAR0_SHA3_IMPL (0x1ul << ID_AA64ISAR0_SHA3_SHIFT) #define ID_AA64ISAR0_SM3_SHIFT 36 #define ID_AA64ISAR0_SM3_MASK (0xful << ID_AA64ISAR0_SM3_SHIFT) #define ID_AA64ISAR0_SM3(x) ((x) & ID_AA64ISAR0_SM3_MASK) #define ID_AA64ISAR0_SM3_NONE (0x0ul << ID_AA64ISAR0_SM3_SHIFT) #define ID_AA64ISAR0_SM3_IMPL (0x1ul << ID_AA64ISAR0_SM3_SHIFT) #define ID_AA64ISAR0_SM4_SHIFT 40 #define ID_AA64ISAR0_SM4_MASK (0xful << ID_AA64ISAR0_SM4_SHIFT) #define ID_AA64ISAR0_SM4(x) ((x) & ID_AA64ISAR0_SM4_MASK) #define ID_AA64ISAR0_SM4_NONE (0x0ul << ID_AA64ISAR0_SM4_SHIFT) #define ID_AA64ISAR0_SM4_IMPL (0x1ul << ID_AA64ISAR0_SM4_SHIFT) #define ID_AA64ISAR0_DP_SHIFT 48 #define ID_AA64ISAR0_DP_MASK (0xful << ID_AA64ISAR0_DP_SHIFT) #define ID_AA64ISAR0_DP(x) ((x) & ID_AA64ISAR0_DP_MASK) #define ID_AA64ISAR0_DP_NONE (0x0ul << ID_AA64ISAR0_DP_SHIFT) #define ID_AA64ISAR0_DP_IMPL (0x1ul << ID_AA64ISAR0_DP_SHIFT) /* ID_AA64ISAR1_EL1 */ #define ID_AA64ISAR1_MASK 0xffffffff #define ID_AA64ISAR1_DPB_SHIFT 0 #define ID_AA64ISAR1_DPB_MASK (0xf << ID_AA64ISAR1_DPB_SHIFT) #define ID_AA64ISAR1_DPB(x) ((x) & ID_AA64ISAR1_DPB_MASK) #define ID_AA64ISAR1_DPB_NONE (0x0 << ID_AA64ISAR1_DPB_SHIFT) #define ID_AA64ISAR1_DPB_IMPL (0x1 << ID_AA64ISAR1_DPB_SHIFT) #define ID_AA64ISAR1_APA_SHIFT 4 #define ID_AA64ISAR1_APA_MASK (0xf << ID_AA64ISAR1_APA_SHIFT) #define ID_AA64ISAR1_APA(x) ((x) & ID_AA64ISAR1_APA_MASK) #define ID_AA64ISAR1_APA_NONE (0x0 << ID_AA64ISAR1_APA_SHIFT) #define ID_AA64ISAR1_APA_IMPL (0x1 << ID_AA64ISAR1_APA_SHIFT) #define ID_AA64ISAR1_API_SHIFT 8 #define ID_AA64ISAR1_API_MASK (0xf << ID_AA64ISAR1_API_SHIFT) #define ID_AA64ISAR1_API(x) ((x) & ID_AA64ISAR1_API_MASK) #define ID_AA64ISAR1_API_NONE (0x0 << ID_AA64ISAR1_API_SHIFT) #define ID_AA64ISAR1_API_IMPL (0x1 << ID_AA64ISAR1_API_SHIFT) #define ID_AA64ISAR1_JSCVT_SHIFT 12 #define ID_AA64ISAR1_JSCVT_MASK (0xf << ID_AA64ISAR1_JSCVT_SHIFT) #define ID_AA64ISAR1_JSCVT(x) ((x) & ID_AA64ISAR1_JSCVT_MASK) #define ID_AA64ISAR1_JSCVT_NONE (0x0 << ID_AA64ISAR1_JSCVT_SHIFT) #define ID_AA64ISAR1_JSCVT_IMPL (0x1 << ID_AA64ISAR1_JSCVT_SHIFT) #define ID_AA64ISAR1_FCMA_SHIFT 16 #define ID_AA64ISAR1_FCMA_MASK (0xf << ID_AA64ISAR1_FCMA_SHIFT) #define ID_AA64ISAR1_FCMA(x) ((x) & ID_AA64ISAR1_FCMA_MASK) #define ID_AA64ISAR1_FCMA_NONE (0x0 << ID_AA64ISAR1_FCMA_SHIFT) #define ID_AA64ISAR1_FCMA_IMPL (0x1 << ID_AA64ISAR1_FCMA_SHIFT) #define ID_AA64ISAR1_LRCPC_SHIFT 20 #define ID_AA64ISAR1_LRCPC_MASK (0xf << ID_AA64ISAR1_LRCPC_SHIFT) #define ID_AA64ISAR1_LRCPC(x) ((x) & ID_AA64ISAR1_LRCPC_MASK) #define ID_AA64ISAR1_LRCPC_NONE (0x0 << ID_AA64ISAR1_LRCPC_SHIFT) #define ID_AA64ISAR1_LRCPC_IMPL (0x1 << ID_AA64ISAR1_LRCPC_SHIFT) #define ID_AA64ISAR1_GPA_SHIFT 24 #define ID_AA64ISAR1_GPA_MASK (0xf << ID_AA64ISAR1_GPA_SHIFT) #define ID_AA64ISAR1_GPA(x) ((x) & ID_AA64ISAR1_GPA_MASK) #define ID_AA64ISAR1_GPA_NONE (0x0 << ID_AA64ISAR1_GPA_SHIFT) #define ID_AA64ISAR1_GPA_IMPL (0x1 << ID_AA64ISAR1_GPA_SHIFT) #define ID_AA64ISAR1_GPI_SHIFT 28 #define ID_AA64ISAR1_GPI_MASK (0xf << ID_AA64ISAR1_GPI_SHIFT) #define ID_AA64ISAR1_GPI(x) ((x) & ID_AA64ISAR1_GPI_MASK) #define ID_AA64ISAR1_GPI_NONE (0x0 << ID_AA64ISAR1_GPI_SHIFT) #define ID_AA64ISAR1_GPI_IMPL (0x1 << ID_AA64ISAR1_GPI_SHIFT) /* ID_AA64MMFR0_EL1 */ #define ID_AA64MMFR0_MASK 0xffffffff #define ID_AA64MMFR0_PA_RANGE_SHIFT 0 #define ID_AA64MMFR0_PA_RANGE_MASK (0xf << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE(x) ((x) & ID_AA64MMFR0_PA_RANGE_MASK) #define ID_AA64MMFR0_PA_RANGE_4G (0x0 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE_64G (0x1 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE_1T (0x2 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE_4T (0x3 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE_16T (0x4 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE_256T (0x5 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_PA_RANGE_4P (0x6 << ID_AA64MMFR0_PA_RANGE_SHIFT) #define ID_AA64MMFR0_ASID_BITS_SHIFT 4 #define ID_AA64MMFR0_ASID_BITS_MASK (0xf << ID_AA64MMFR0_ASID_BITS_SHIFT) #define ID_AA64MMFR0_ASID_BITS(x) ((x) & ID_AA64MMFR0_ASID_BITS_MASK) #define ID_AA64MMFR0_ASID_BITS_8 (0x0 << ID_AA64MMFR0_ASID_BITS_SHIFT) #define ID_AA64MMFR0_ASID_BITS_16 (0x2 << ID_AA64MMFR0_ASID_BITS_SHIFT) #define ID_AA64MMFR0_BIGEND_SHIFT 8 #define ID_AA64MMFR0_BIGEND_MASK (0xf << ID_AA64MMFR0_BIGEND_SHIFT) #define ID_AA64MMFR0_BIGEND(x) ((x) & ID_AA64MMFR0_BIGEND_MASK) #define ID_AA64MMFR0_BIGEND_FIXED (0x0 << ID_AA64MMFR0_BIGEND_SHIFT) #define ID_AA64MMFR0_BIGEND_MIXED (0x1 << ID_AA64MMFR0_BIGEND_SHIFT) #define ID_AA64MMFR0_S_NS_MEM_SHIFT 12 #define ID_AA64MMFR0_S_NS_MEM_MASK (0xf << ID_AA64MMFR0_S_NS_MEM_SHIFT) #define ID_AA64MMFR0_S_NS_MEM(x) ((x) & ID_AA64MMFR0_S_NS_MEM_MASK) #define ID_AA64MMFR0_S_NS_MEM_NONE (0x0 << ID_AA64MMFR0_S_NS_MEM_SHIFT) #define ID_AA64MMFR0_S_NS_MEM_DISTINCT (0x1 << ID_AA64MMFR0_S_NS_MEM_SHIFT) #define ID_AA64MMFR0_BIGEND_EL0_SHIFT 16 #define ID_AA64MMFR0_BIGEND_EL0_MASK (0xf << ID_AA64MMFR0_BIGEND_EL0_SHIFT) #define ID_AA64MMFR0_BIGEND_EL0(x) ((x) & ID_AA64MMFR0_BIGEND_EL0_MASK) #define ID_AA64MMFR0_BIGEND_EL0_FIXED (0x0 << ID_AA64MMFR0_BIGEND_EL0_SHIFT) #define ID_AA64MMFR0_BIGEND_EL0_MIXED (0x1 << ID_AA64MMFR0_BIGEND_EL0_SHIFT) #define ID_AA64MMFR0_TGRAN16_SHIFT 20 #define ID_AA64MMFR0_TGRAN16_MASK (0xf << ID_AA64MMFR0_TGRAN16_SHIFT) #define ID_AA64MMFR0_TGRAN16(x) ((x) & ID_AA64MMFR0_TGRAN16_MASK) #define ID_AA64MMFR0_TGRAN16_NONE (0x0 << ID_AA64MMFR0_TGRAN16_SHIFT) #define ID_AA64MMFR0_TGRAN16_IMPL (0x1 << ID_AA64MMFR0_TGRAN16_SHIFT) #define ID_AA64MMFR0_TGRAN64_SHIFT 24 #define ID_AA64MMFR0_TGRAN64_MASK (0xf << ID_AA64MMFR0_TGRAN64_SHIFT) #define ID_AA64MMFR0_TGRAN64(x) ((x) & ID_AA64MMFR0_TGRAN64_MASK) #define ID_AA64MMFR0_TGRAN64_IMPL (0x0 << ID_AA64MMFR0_TGRAN64_SHIFT) #define ID_AA64MMFR0_TGRAN64_NONE (0xf << ID_AA64MMFR0_TGRAN64_SHIFT) #define ID_AA64MMFR0_TGRAN4_SHIFT 28 #define ID_AA64MMFR0_TGRAN4_MASK (0xf << ID_AA64MMFR0_TGRAN4_SHIFT) #define ID_AA64MMFR0_TGRAN4(x) ((x) & ID_AA64MMFR0_TGRAN4_MASK) #define ID_AA64MMFR0_TGRAN4_IMPL (0x0 << ID_AA64MMFR0_TGRAN4_SHIFT) #define ID_AA64MMFR0_TGRAN4_NONE (0xf << ID_AA64MMFR0_TGRAN4_SHIFT) /* ID_AA64MMFR1_EL1 */ #define ID_AA64MMFR1_MASK 0xffffffff #define ID_AA64MMFR1_HAFDBS_SHIFT 0 #define ID_AA64MMFR1_HAFDBS_MASK (0xf << ID_AA64MMFR1_HAFDBS_SHIFT) #define ID_AA64MMFR1_HAFDBS(x) ((x) & ID_AA64MMFR1_HAFDBS_MASK) #define ID_AA64MMFR1_HAFDBS_NONE (0x0 << ID_AA64MMFR1_HAFDBS_SHIFT) #define ID_AA64MMFR1_HAFDBS_AF (0x1 << ID_AA64MMFR1_HAFDBS_SHIFT) #define ID_AA64MMFR1_HAFDBS_AF_DBS (0x2 << ID_AA64MMFR1_HAFDBS_SHIFT) #define ID_AA64MMFR1_VMIDBITS_SHIFT 4 #define ID_AA64MMFR1_VMIDBITS_MASK (0xf << ID_AA64MMFR1_VMIDBITS_SHIFT) #define ID_AA64MMFR1_VMIDBITS(x) ((x) & ID_AA64MMFR1_VMIDBITS_MASK) #define ID_AA64MMFR1_VMIDBITS_8 (0x0 << ID_AA64MMFR1_VMIDBITS_SHIFT) #define ID_AA64MMFR1_VMIDBITS_16 (0x2 << ID_AA64MMFR1_VMIDBITS_SHIFT) #define ID_AA64MMFR1_VH_SHIFT 8 #define ID_AA64MMFR1_VH_MASK (0xf << ID_AA64MMFR1_VH_SHIFT) #define ID_AA64MMFR1_VH(x) ((x) & ID_AA64MMFR1_VH_MASK) #define ID_AA64MMFR1_VH_NONE (0x0 << ID_AA64MMFR1_VH_SHIFT) #define ID_AA64MMFR1_VH_IMPL (0x1 << ID_AA64MMFR1_VH_SHIFT) #define ID_AA64MMFR1_HPDS_SHIFT 12 #define ID_AA64MMFR1_HPDS_MASK (0xf << ID_AA64MMFR1_HPDS_SHIFT) #define ID_AA64MMFR1_HPDS(x) ((x) & ID_AA64MMFR1_HPDS_MASK) #define ID_AA64MMFR1_HPDS_NONE (0x0 << ID_AA64MMFR1_HPDS_SHIFT) #define ID_AA64MMFR1_HPDS_HPD (0x1 << ID_AA64MMFR1_HPDS_SHIFT) #define ID_AA64MMFR1_HPDS_TTPBHA (0x2 << ID_AA64MMFR1_HPDS_SHIFT) #define ID_AA64MMFR1_LO_SHIFT 16 #define ID_AA64MMFR1_LO_MASK (0xf << ID_AA64MMFR1_LO_SHIFT) #define ID_AA64MMFR1_LO(x) ((x) & ID_AA64MMFR1_LO_MASK) #define ID_AA64MMFR1_LO_NONE (0x0 << ID_AA64MMFR1_LO_SHIFT) #define ID_AA64MMFR1_LO_IMPL (0x1 << ID_AA64MMFR1_LO_SHIFT) #define ID_AA64MMFR1_PAN_SHIFT 20 #define ID_AA64MMFR1_PAN_MASK (0xf << ID_AA64MMFR1_PAN_SHIFT) #define ID_AA64MMFR1_PAN(x) ((x) & ID_AA64MMFR1_PAN_MASK) #define ID_AA64MMFR1_PAN_NONE (0x0 << ID_AA64MMFR1_PAN_SHIFT) #define ID_AA64MMFR1_PAN_IMPL (0x1 << ID_AA64MMFR1_PAN_SHIFT) #define ID_AA64MMFR1_PAN_ATS1E1 (0x2 << ID_AA64MMFR1_PAN_SHIFT) #define ID_AA64MMFR1_SPEC_SEI_SHIFT 24 #define ID_AA64MMFR1_SPEC_SEI_MASK (0xf << ID_AA64MMFR1_SPEC_SEI_SHIFT) #define ID_AA64MMFR1_SPEC_SEI(x) ((x) & ID_AA64MMFR1_SPEC_SEI_MASK) #define ID_AA64MMFR1_SPEC_SEI_NONE (0x0 << ID_AA64MMFR1_SPEC_SEI_SHIFT) #define ID_AA64MMFR1_SPEC_SEI_IMPL (0x1 << ID_AA64MMFR1_SPEC_SEI_SHIFT) #define ID_AA64MMFR1_XNX_SHIFT 28 #define ID_AA64MMFR1_XNX_MASK (0xf << ID_AA64MMFR1_XNX_SHIFT) #define ID_AA64MMFR1_XNX(x) ((x) & ID_AA64MMFR1_XNX_MASK) #define ID_AA64MMFR1_XNX_NONE (0x0 << ID_AA64MMFR1_XNX_SHIFT) #define ID_AA64MMFR1_XNX_IMPL (0x1 << ID_AA64MMFR1_XNX_SHIFT) /* ID_AA64MMFR2_EL1 */ #define ID_AA64MMFR2_EL1 S3_0_C0_C7_2 #define ID_AA64MMFR2_MASK 0x0fffffff #define ID_AA64MMFR2_CNP_SHIFT 0 #define ID_AA64MMFR2_CNP_MASK (0xf << ID_AA64MMFR2_CNP_SHIFT) #define ID_AA64MMFR2_CNP(x) ((x) & ID_AA64MMFR2_CNP_MASK) #define ID_AA64MMFR2_CNP_NONE (0x0 << ID_AA64MMFR2_CNP_SHIFT) #define ID_AA64MMFR2_CNP_IMPL (0x1 << ID_AA64MMFR2_CNP_SHIFT) #define ID_AA64MMFR2_UAO_SHIFT 4 #define ID_AA64MMFR2_UAO_MASK (0xf << ID_AA64MMFR2_UAO_SHIFT) #define ID_AA64MMFR2_UAO(x) ((x) & ID_AA64MMFR2_UAO_MASK) #define ID_AA64MMFR2_UAO_NONE (0x0 << ID_AA64MMFR2_UAO_SHIFT) #define ID_AA64MMFR2_UAO_IMPL (0x1 << ID_AA64MMFR2_UAO_SHIFT) #define ID_AA64MMFR2_LSM_SHIFT 8 #define ID_AA64MMFR2_LSM_MASK (0xf << ID_AA64MMFR2_LSM_SHIFT) #define ID_AA64MMFR2_LSM(x) ((x) & ID_AA64MMFR2_LSM_MASK) #define ID_AA64MMFR2_LSM_NONE (0x0 << ID_AA64MMFR2_LSM_SHIFT) #define ID_AA64MMFR2_LSM_IMPL (0x1 << ID_AA64MMFR2_LSM_SHIFT) #define ID_AA64MMFR2_IESB_SHIFT 12 #define ID_AA64MMFR2_IESB_MASK (0xf << ID_AA64MMFR2_IESB_SHIFT) #define ID_AA64MMFR2_IESB(x) ((x) & ID_AA64MMFR2_IESB_MASK) #define ID_AA64MMFR2_IESB_NONE (0x0 << ID_AA64MMFR2_IESB_SHIFT) #define ID_AA64MMFR2_IESB_IMPL (0x1 << ID_AA64MMFR2_IESB_SHIFT) #define ID_AA64MMFR2_VA_RANGE_SHIFT 16 #define ID_AA64MMFR2_VA_RANGE_MASK (0xf << ID_AA64MMFR2_VA_RANGE_SHIFT) #define ID_AA64MMFR2_VA_RANGE(x) ((x) & ID_AA64MMFR2_VA_RANGE_MASK) #define ID_AA64MMFR2_VA_RANGE_48 (0x0 << ID_AA64MMFR2_VA_RANGE_SHIFT) #define ID_AA64MMFR2_VA_RANGE_52 (0x1 << ID_AA64MMFR2_VA_RANGE_SHIFT) #define ID_AA64MMFR2_CCIDX_SHIFT 20 #define ID_AA64MMFR2_CCIDX_MASK (0xf << ID_AA64MMFR2_CCIDX_SHIFT) #define ID_AA64MMFR2_CCIDX(x) ((x) & ID_AA64MMFR2_CCIDX_MASK) #define ID_AA64MMFR2_CCIDX_32 (0x0 << ID_AA64MMFR2_CCIDX_SHIFT) #define ID_AA64MMFR2_CCIDX_64 (0x1 << ID_AA64MMFR2_CCIDX_SHIFT) #define ID_AA64MMFR2_NV_SHIFT 24 #define ID_AA64MMFR2_NV_MASK (0xf << ID_AA64MMFR2_NV_SHIFT) #define ID_AA64MMFR2_NV(x) ((x) & ID_AA64MMFR2_NV_MASK) #define ID_AA64MMFR2_NV_NONE (0x0 << ID_AA64MMFR2_NV_SHIFT) #define ID_AA64MMFR2_NV_IMPL (0x1 << ID_AA64MMFR2_NV_SHIFT) /* ID_AA64PFR0_EL1 */ #define ID_AA64PFR0_MASK 0x0000000ffffffffful #define ID_AA64PFR0_EL0_SHIFT 0 #define ID_AA64PFR0_EL0_MASK (0xf << ID_AA64PFR0_EL0_SHIFT) #define ID_AA64PFR0_EL0(x) ((x) & ID_AA64PFR0_EL0_MASK) #define ID_AA64PFR0_EL0_64 (1 << ID_AA64PFR0_EL0_SHIFT) #define ID_AA64PFR0_EL0_64_32 (2 << ID_AA64PFR0_EL0_SHIFT) #define ID_AA64PFR0_EL1_SHIFT 4 #define ID_AA64PFR0_EL1_MASK (0xf << ID_AA64PFR0_EL1_SHIFT) #define ID_AA64PFR0_EL1(x) ((x) & ID_AA64PFR0_EL1_MASK) #define ID_AA64PFR0_EL1_64 (1 << ID_AA64PFR0_EL1_SHIFT) #define ID_AA64PFR0_EL1_64_32 (2 << ID_AA64PFR0_EL1_SHIFT) #define ID_AA64PFR0_EL2_SHIFT 8 #define ID_AA64PFR0_EL2_MASK (0xf << ID_AA64PFR0_EL2_SHIFT) #define ID_AA64PFR0_EL2(x) ((x) & ID_AA64PFR0_EL2_MASK) #define ID_AA64PFR0_EL2_NONE (0 << ID_AA64PFR0_EL2_SHIFT) #define ID_AA64PFR0_EL2_64 (1 << ID_AA64PFR0_EL2_SHIFT) #define ID_AA64PFR0_EL2_64_32 (2 << ID_AA64PFR0_EL2_SHIFT) #define ID_AA64PFR0_EL3_SHIFT 12 #define ID_AA64PFR0_EL3_MASK (0xf << ID_AA64PFR0_EL3_SHIFT) #define ID_AA64PFR0_EL3(x) ((x) & ID_AA64PFR0_EL3_MASK) #define ID_AA64PFR0_EL3_NONE (0 << ID_AA64PFR0_EL3_SHIFT) #define ID_AA64PFR0_EL3_64 (1 << ID_AA64PFR0_EL3_SHIFT) #define ID_AA64PFR0_EL3_64_32 (2 << ID_AA64PFR0_EL3_SHIFT) #define ID_AA64PFR0_FP_SHIFT 16 #define ID_AA64PFR0_FP_MASK (0xf << ID_AA64PFR0_FP_SHIFT) #define ID_AA64PFR0_FP(x) ((x) & ID_AA64PFR0_FP_MASK) #define ID_AA64PFR0_FP_IMPL (0x0 << ID_AA64PFR0_FP_SHIFT) #define ID_AA64PFR0_FP_HP (0x1 << ID_AA64PFR0_FP_SHIFT) #define ID_AA64PFR0_FP_NONE (0xf << ID_AA64PFR0_FP_SHIFT) #define ID_AA64PFR0_ADV_SIMD_SHIFT 20 #define ID_AA64PFR0_ADV_SIMD_MASK (0xf << ID_AA64PFR0_ADV_SIMD_SHIFT) #define ID_AA64PFR0_ADV_SIMD(x) ((x) & ID_AA64PFR0_ADV_SIMD_MASK) #define ID_AA64PFR0_ADV_SIMD_IMPL (0x0 << ID_AA64PFR0_ADV_SIMD_SHIFT) #define ID_AA64PFR0_ADV_SIMD_HP (0x1 << ID_AA64PFR0_ADV_SIMD_SHIFT) #define ID_AA64PFR0_ADV_SIMD_NONE (0xf << ID_AA64PFR0_ADV_SIMD_SHIFT) #define ID_AA64PFR0_GIC_BITS 0x4 /* Number of bits in GIC field */ #define ID_AA64PFR0_GIC_SHIFT 24 #define ID_AA64PFR0_GIC_MASK (0xf << ID_AA64PFR0_GIC_SHIFT) #define ID_AA64PFR0_GIC(x) ((x) & ID_AA64PFR0_GIC_MASK) #define ID_AA64PFR0_GIC_CPUIF_NONE (0x0 << ID_AA64PFR0_GIC_SHIFT) #define ID_AA64PFR0_GIC_CPUIF_EN (0x1 << ID_AA64PFR0_GIC_SHIFT) #define ID_AA64PFR0_RAS_SHIFT 28 #define ID_AA64PFR0_RAS_MASK (0xf << ID_AA64PFR0_RAS_SHIFT) #define ID_AA64PFR0_RAS(x) ((x) & ID_AA64PFR0_RAS_MASK) #define ID_AA64PFR0_RAS_NONE (0x0 << ID_AA64PFR0_RAS_SHIFT) #define ID_AA64PFR0_RAS_V1 (0x1 << ID_AA64PFR0_RAS_SHIFT) #define ID_AA64PFR0_SVE_SHIFT 32 #define ID_AA64PFR0_SVE_MASK (0xful << ID_AA64PFR0_SVE_SHIFT) #define ID_AA64PFR0_SVE(x) ((x) & ID_AA64PFR0_SVE_MASK) #define ID_AA64PFR0_SVE_NONE (0x0ul << ID_AA64PFR0_SVE_SHIFT) #define ID_AA64PFR0_SVE_IMPL (0x1ul << ID_AA64PFR0_SVE_SHIFT) /* MAIR_EL1 - Memory Attribute Indirection Register */ #define MAIR_ATTR_MASK(idx) (0xff << ((n)* 8)) #define MAIR_ATTR(attr, idx) ((attr) << ((idx) * 8)) #define MAIR_DEVICE_nGnRnE 0x00 #define MAIR_NORMAL_NC 0x44 #define MAIR_NORMAL_WT 0xbb #define MAIR_NORMAL_WB 0xff /* PAR_EL1 - Physical Address Register */ #define PAR_F_SHIFT 0 #define PAR_F (0x1 << PAR_F_SHIFT) #define PAR_SUCCESS(x) (((x) & PAR_F) == 0) /* When PAR_F == 0 (success) */ #define PAR_SH_SHIFT 7 #define PAR_SH_MASK (0x3 << PAR_SH_SHIFT) #define PAR_NS_SHIFT 9 #define PAR_NS_MASK (0x3 << PAR_NS_SHIFT) #define PAR_PA_SHIFT 12 #define PAR_PA_MASK 0x0000fffffffff000 #define PAR_ATTR_SHIFT 56 #define PAR_ATTR_MASK (0xff << PAR_ATTR_SHIFT) /* When PAR_F == 1 (aborted) */ #define PAR_FST_SHIFT 1 #define PAR_FST_MASK (0x3f << PAR_FST_SHIFT) #define PAR_PTW_SHIFT 8 #define PAR_PTW_MASK (0x1 << PAR_PTW_SHIFT) #define PAR_S_SHIFT 9 #define PAR_S_MASK (0x1 << PAR_S_SHIFT) /* SCTLR_EL1 - System Control Register */ #define SCTLR_RES0 0xc8222400 /* Reserved ARMv8.0, write 0 */ #define SCTLR_RES1 0x30d00800 /* Reserved ARMv8.0, write 1 */ #define SCTLR_M 0x00000001 #define SCTLR_A 0x00000002 #define SCTLR_C 0x00000004 #define SCTLR_SA 0x00000008 #define SCTLR_SA0 0x00000010 #define SCTLR_CP15BEN 0x00000020 #define SCTLR_THEE 0x00000040 #define SCTLR_ITD 0x00000080 #define SCTLR_SED 0x00000100 #define SCTLR_UMA 0x00000200 #define SCTLR_I 0x00001000 #define SCTLR_DZE 0x00004000 #define SCTLR_UCT 0x00008000 #define SCTLR_nTWI 0x00010000 #define SCTLR_nTWE 0x00040000 #define SCTLR_WXN 0x00080000 #define SCTLR_IESB 0x00200000 #define SCTLR_SPAN 0x00800000 #define SCTLR_EOE 0x01000000 #define SCTLR_EE 0x02000000 #define SCTLR_UCI 0x04000000 #define SCTLR_nTLSMD 0x10000000 #define SCTLR_LSMAOE 0x20000000 /* SPSR_EL1 */ /* * When the exception is taken in AArch64: * M[4] is 0 for AArch64 mode * M[3:2] is the exception level * M[1] is unused * M[0] is the SP select: * 0: always SP0 * 1: current ELs SP */ #define PSR_M_EL0t 0x00000000 #define PSR_M_EL1t 0x00000004 #define PSR_M_EL1h 0x00000005 #define PSR_M_EL2t 0x00000008 #define PSR_M_EL2h 0x00000009 #define PSR_M_MASK 0x0000001f #define PSR_F 0x00000040 #define PSR_I 0x00000080 #define PSR_A 0x00000100 #define PSR_D 0x00000200 #define PSR_IL 0x00100000 #define PSR_SS 0x00200000 #define PSR_V 0x10000000 #define PSR_C 0x20000000 #define PSR_Z 0x40000000 #define PSR_N 0x80000000 +#define PSR_FLAGS 0xf0000000 /* TCR_EL1 - Translation Control Register */ #define TCR_ASID_16 (1 << 36) #define TCR_IPS_SHIFT 32 #define TCR_IPS_32BIT (0 << TCR_IPS_SHIFT) #define TCR_IPS_36BIT (1 << TCR_IPS_SHIFT) #define TCR_IPS_40BIT (2 << TCR_IPS_SHIFT) #define TCR_IPS_42BIT (3 << TCR_IPS_SHIFT) #define TCR_IPS_44BIT (4 << TCR_IPS_SHIFT) #define TCR_IPS_48BIT (5 << TCR_IPS_SHIFT) #define TCR_TG1_SHIFT 30 #define TCR_TG1_16K (1 << TCR_TG1_SHIFT) #define TCR_TG1_4K (2 << TCR_TG1_SHIFT) #define TCR_TG1_64K (3 << TCR_TG1_SHIFT) #define TCR_SH1_SHIFT 28 #define TCR_SH1_IS (0x3UL << TCR_SH1_SHIFT) #define TCR_ORGN1_SHIFT 26 #define TCR_ORGN1_WBWA (0x1UL << TCR_ORGN1_SHIFT) #define TCR_IRGN1_SHIFT 24 #define TCR_IRGN1_WBWA (0x1UL << TCR_IRGN1_SHIFT) #define TCR_SH0_SHIFT 12 #define TCR_SH0_IS (0x3UL << TCR_SH0_SHIFT) #define TCR_ORGN0_SHIFT 10 #define TCR_ORGN0_WBWA (0x1UL << TCR_ORGN0_SHIFT) #define TCR_IRGN0_SHIFT 8 #define TCR_IRGN0_WBWA (0x1UL << TCR_IRGN0_SHIFT) #define TCR_CACHE_ATTRS ((TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) |\ (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)) #ifdef SMP #define TCR_SMP_ATTRS (TCR_SH0_IS | TCR_SH1_IS) #else #define TCR_SMP_ATTRS 0 #endif #define TCR_T1SZ_SHIFT 16 #define TCR_T0SZ_SHIFT 0 #define TCR_T1SZ(x) ((x) << TCR_T1SZ_SHIFT) #define TCR_T0SZ(x) ((x) << TCR_T0SZ_SHIFT) #define TCR_TxSZ(x) (TCR_T1SZ(x) | TCR_T0SZ(x)) /* Saved Program Status Register */ #define DBG_SPSR_SS (0x1 << 21) /* Monitor Debug System Control Register */ #define DBG_MDSCR_SS (0x1 << 0) #define DBG_MDSCR_KDE (0x1 << 13) #define DBG_MDSCR_MDE (0x1 << 15) /* Perfomance Monitoring Counters */ #define PMCR_E (1 << 0) /* Enable all counters */ #define PMCR_P (1 << 1) /* Reset all counters */ #define PMCR_C (1 << 2) /* Clock counter reset */ #define PMCR_D (1 << 3) /* CNTR counts every 64 clk cycles */ #define PMCR_X (1 << 4) /* Export to ext. monitoring (ETM) */ #define PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define PMCR_LC (1 << 6) /* Long cycle count enable */ #define PMCR_IMP_SHIFT 24 /* Implementer code */ #define PMCR_IMP_MASK (0xff << PMCR_IMP_SHIFT) #define PMCR_IDCODE_SHIFT 16 /* Identification code */ #define PMCR_IDCODE_MASK (0xff << PMCR_IDCODE_SHIFT) #define PMCR_IDCODE_CORTEX_A57 0x01 #define PMCR_IDCODE_CORTEX_A72 0x02 #define PMCR_IDCODE_CORTEX_A53 0x03 #define PMCR_N_SHIFT 11 /* Number of counters implemented */ #define PMCR_N_MASK (0x1f << PMCR_N_SHIFT) #endif /* !_MACHINE_ARMREG_H_ */ Index: projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c =================================================================== --- projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c (revision 326162) @@ -1,2697 +1,2691 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * Portions Copyright 2010 The FreeBSD Foundation * * $FreeBSD$ */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #ifdef illumos #include #endif #include #include #include #ifdef illumos #include #endif #include #include #include #include #include #include #include #include #ifdef illumos #include #endif #include #include #ifndef illumos #include #include #include #include #include #include #include #include #include #include #include #include #endif /* * User-Land Trap-Based Tracing * ---------------------------- * * The fasttrap provider allows DTrace consumers to instrument any user-level * instruction to gather data; this includes probes with semantic * signifigance like entry and return as well as simple offsets into the * function. While the specific techniques used are very ISA specific, the * methodology is generalizable to any architecture. * * * The General Methodology * ----------------------- * * With the primary goal of tracing every user-land instruction and the * limitation that we can't trust user space so don't want to rely on much * information there, we begin by replacing the instructions we want to trace * with trap instructions. Each instruction we overwrite is saved into a hash * table keyed by process ID and pc address. When we enter the kernel due to * this trap instruction, we need the effects of the replaced instruction to * appear to have occurred before we proceed with the user thread's * execution. * * Each user level thread is represented by a ulwp_t structure which is * always easily accessible through a register. The most basic way to produce * the effects of the instruction we replaced is to copy that instruction out * to a bit of scratch space reserved in the user thread's ulwp_t structure * (a sort of kernel-private thread local storage), set the PC to that * scratch space and single step. When we reenter the kernel after single * stepping the instruction we must then adjust the PC to point to what would * normally be the next instruction. Of course, special care must be taken * for branches and jumps, but these represent such a small fraction of any * instruction set that writing the code to emulate these in the kernel is * not too difficult. * * Return probes may require several tracepoints to trace every return site, * and, conversely, each tracepoint may activate several probes (the entry * and offset 0 probes, for example). To solve this muliplexing problem, * tracepoints contain lists of probes to activate and probes contain lists * of tracepoints to enable. If a probe is activated, it adds its ID to * existing tracepoints or creates new ones as necessary. * * Most probes are activated _before_ the instruction is executed, but return * probes are activated _after_ the effects of the last instruction of the * function are visible. Return probes must be fired _after_ we have * single-stepped the instruction whereas all other probes are fired * beforehand. * * * Lock Ordering * ------------- * * The lock ordering below -- both internally and with respect to the DTrace * framework -- is a little tricky and bears some explanation. Each provider * has a lock (ftp_mtx) that protects its members including reference counts * for enabled probes (ftp_rcount), consumers actively creating probes * (ftp_ccount) and USDT consumers (ftp_mcount); all three prevent a provider * from being freed. A provider is looked up by taking the bucket lock for the * provider hash table, and is returned with its lock held. The provider lock * may be taken in functions invoked by the DTrace framework, but may not be * held while calling functions in the DTrace framework. * * To ensure consistency over multiple calls to the DTrace framework, the * creation lock (ftp_cmtx) should be held. Naturally, the creation lock may * not be taken when holding the provider lock as that would create a cyclic * lock ordering. In situations where one would naturally take the provider * lock and then the creation lock, we instead up a reference count to prevent * the provider from disappearing, drop the provider lock, and acquire the * creation lock. * * Briefly: * bucket lock before provider lock * DTrace before provider lock * creation lock before DTrace * never hold the provider lock and creation lock simultaneously */ static d_open_t fasttrap_open; static d_ioctl_t fasttrap_ioctl; static struct cdevsw fasttrap_cdevsw = { .d_version = D_VERSION, .d_open = fasttrap_open, .d_ioctl = fasttrap_ioctl, .d_name = "fasttrap", }; static struct cdev *fasttrap_cdev; static dtrace_meta_provider_id_t fasttrap_meta_id; static struct proc *fasttrap_cleanup_proc; static struct mtx fasttrap_cleanup_mtx; static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv; /* * Generation count on modifications to the global tracepoint lookup table. */ static volatile uint64_t fasttrap_mod_gen; /* * When the fasttrap provider is loaded, fasttrap_max is set to either * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the * fasttrap.conf file (Illumos), or the value provied in the loader.conf (FreeBSD). * Each time a probe is created, fasttrap_total is incremented by the number * of tracepoints that may be associated with that probe; fasttrap_total is capped * at fasttrap_max. */ #define FASTTRAP_MAX_DEFAULT 250000 static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT; static uint32_t fasttrap_total; /* * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #define FASTTRAP_TPOINTS_DEFAULT_SIZE 0x4000 #define FASTTRAP_PROVIDERS_DEFAULT_SIZE 0x100 #define FASTTRAP_PROCS_DEFAULT_SIZE 0x100 #define FASTTRAP_PID_NAME "pid" fasttrap_hash_t fasttrap_tpoints; static fasttrap_hash_t fasttrap_provs; static fasttrap_hash_t fasttrap_procs; static uint64_t fasttrap_pid_count; /* pid ref count */ static kmutex_t fasttrap_count_mtx; /* lock on ref count */ #define FASTTRAP_ENABLE_FAIL 1 #define FASTTRAP_ENABLE_PARTIAL 2 static int fasttrap_tracepoint_enable(proc_t *, fasttrap_probe_t *, uint_t); static void fasttrap_tracepoint_disable(proc_t *, fasttrap_probe_t *, uint_t); static fasttrap_provider_t *fasttrap_provider_lookup(pid_t, const char *, const dtrace_pattr_t *); static void fasttrap_provider_retire(pid_t, const char *, int); static void fasttrap_provider_free(fasttrap_provider_t *); static fasttrap_proc_t *fasttrap_proc_lookup(pid_t); static void fasttrap_proc_release(fasttrap_proc_t *); #ifndef illumos static void fasttrap_thread_dtor(void *, struct thread *); #endif #define FASTTRAP_PROVS_INDEX(pid, name) \ ((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask) #define FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask) #ifndef illumos struct rmlock fasttrap_tp_lock; static eventhandler_tag fasttrap_thread_dtor_tag; #endif static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE; #ifdef __FreeBSD__ SYSCTL_DECL(_kern_dtrace); SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters"); SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max, FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes"); SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size, FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table"); #endif static int fasttrap_highbit(ulong_t i) { int h = 1; if (i == 0) return (0); #ifdef _LP64 if (i & 0xffffffff00000000ul) { h += 32; i >>= 32; } #endif if (i & 0xffff0000) { h += 16; i >>= 16; } if (i & 0xff00) { h += 8; i >>= 8; } if (i & 0xf0) { h += 4; i >>= 4; } if (i & 0xc) { h += 2; i >>= 2; } if (i & 0x2) { h += 1; } return (h); } static uint_t fasttrap_hash_str(const char *p) { unsigned int g; uint_t hval = 0; while (*p) { hval = (hval << 4) + *p++; if ((g = (hval & 0xf0000000)) != 0) hval ^= g >> 24; hval &= ~g; } return (hval); } void fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc) { #ifdef illumos sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); sqp->sq_info.si_signo = SIGTRAP; sqp->sq_info.si_code = TRAP_DTRACE; sqp->sq_info.si_addr = (caddr_t)pc; mutex_enter(&p->p_lock); sigaddqa(p, t, sqp); mutex_exit(&p->p_lock); if (t != NULL) aston(t); #else ksiginfo_t *ksi = kmem_zalloc(sizeof (ksiginfo_t), KM_SLEEP); ksiginfo_init(ksi); ksi->ksi_signo = SIGTRAP; ksi->ksi_code = TRAP_DTRACE; ksi->ksi_addr = (caddr_t)pc; PROC_LOCK(p); (void) tdsendsignal(p, t, SIGTRAP, ksi); PROC_UNLOCK(p); #endif } #ifndef illumos /* * Obtain a chunk of scratch space in the address space of the target process. */ fasttrap_scrspace_t * fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc) { fasttrap_scrblock_t *scrblk; fasttrap_scrspace_t *scrspc; struct proc *p; vm_offset_t addr; int error, i; scrspc = NULL; if (td->t_dtrace_sscr != NULL) { /* If the thread already has scratch space, we're done. */ scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr; return (scrspc); } p = td->td_proc; mutex_enter(&fprc->ftpc_mtx); if (LIST_EMPTY(&fprc->ftpc_fscr)) { /* * No scratch space is available, so we'll map a new scratch * space block into the traced process' address space. */ addr = 0; error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) goto done; scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK); scrblk->ftsb_addr = addr; LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next); /* * Carve the block up into chunks and put them on the free list. */ for (i = 0; i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) { scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK); scrspc->ftss_addr = addr + i * FASTTRAP_SCRSPACE_SIZE; LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next); } } /* * Take the first scratch chunk off the free list, put it on the * allocated list, and return its address. */ scrspc = LIST_FIRST(&fprc->ftpc_fscr); LIST_REMOVE(scrspc, ftss_next); LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next); /* * This scratch space is reserved for use by td until the thread exits. */ td->t_dtrace_sscr = scrspc; done: mutex_exit(&fprc->ftpc_mtx); return (scrspc); } /* * Return any allocated per-thread scratch space chunks back to the process' * free list. */ static void fasttrap_thread_dtor(void *arg __unused, struct thread *td) { fasttrap_bucket_t *bucket; fasttrap_proc_t *fprc; fasttrap_scrspace_t *scrspc; pid_t pid; if (td->t_dtrace_sscr == NULL) return; pid = td->td_proc->p_pid; bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; fprc = NULL; /* Look up the fasttrap process handle for this process. */ mutex_enter(&bucket->ftb_mtx); for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) { if (fprc->ftpc_pid == pid) { mutex_enter(&fprc->ftpc_mtx); mutex_exit(&bucket->ftb_mtx); break; } } if (fprc == NULL) { mutex_exit(&bucket->ftb_mtx); return; } scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr; LIST_REMOVE(scrspc, ftss_next); LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next); mutex_exit(&fprc->ftpc_mtx); } #endif /* * This function ensures that no threads are actively using the memory * associated with probes that were formerly live. */ static void fasttrap_mod_barrier(uint64_t gen) { int i; if (gen < fasttrap_mod_gen) return; fasttrap_mod_gen++; #ifdef illumos CPU_FOREACH(i) { mutex_enter(&fasttrap_cpuc_pid_lock[i]); mutex_exit(&fasttrap_cpuc_pid_lock[i]); } #else rm_wlock(&fasttrap_tp_lock); rm_wunlock(&fasttrap_tp_lock); #endif } /* * This function performs asynchronous cleanup of fasttrap providers. The * Solaris implementation of this mechanism use a timeout that's activated in * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler. * Thus we use a dedicated process to perform the cleanup when requested. */ /*ARGSUSED*/ static void fasttrap_pid_cleanup_cb(void *data) { fasttrap_provider_t **fpp, *fp; fasttrap_bucket_t *bucket; dtrace_provider_id_t provid; int i, later = 0, rval; mtx_lock(&fasttrap_cleanup_mtx); while (!fasttrap_cleanup_drain || later > 0) { fasttrap_cleanup_work = 0; mtx_unlock(&fasttrap_cleanup_mtx); later = 0; /* * Iterate over all the providers trying to remove the marked * ones. If a provider is marked but not retired, we just * have to take a crack at removing it -- it's no big deal if * we can't. */ for (i = 0; i < fasttrap_provs.fth_nent; i++) { bucket = &fasttrap_provs.fth_table[i]; mutex_enter(&bucket->ftb_mtx); fpp = (fasttrap_provider_t **)&bucket->ftb_data; while ((fp = *fpp) != NULL) { if (!fp->ftp_marked) { fpp = &fp->ftp_next; continue; } mutex_enter(&fp->ftp_mtx); /* * If this provider has consumers actively * creating probes (ftp_ccount) or is a USDT * provider (ftp_mcount), we can't unregister * or even condense. */ if (fp->ftp_ccount != 0 || fp->ftp_mcount != 0) { mutex_exit(&fp->ftp_mtx); fp->ftp_marked = 0; continue; } if (!fp->ftp_retired || fp->ftp_rcount != 0) fp->ftp_marked = 0; mutex_exit(&fp->ftp_mtx); /* * If we successfully unregister this * provider we can remove it from the hash * chain and free the memory. If our attempt * to unregister fails and this is a retired * provider, increment our flag to try again * pretty soon. If we've consumed more than * half of our total permitted number of * probes call dtrace_condense() to try to * clean out the unenabled probes. */ provid = fp->ftp_provid; if ((rval = dtrace_unregister(provid)) != 0) { if (fasttrap_total > fasttrap_max / 2) (void) dtrace_condense(provid); if (rval == EAGAIN) fp->ftp_marked = 1; later += fp->ftp_marked; fpp = &fp->ftp_next; } else { *fpp = fp->ftp_next; fasttrap_provider_free(fp); } } mutex_exit(&bucket->ftb_mtx); } mtx_lock(&fasttrap_cleanup_mtx); /* * If we were unable to retire a provider, try again after a * second. This situation can occur in certain circumstances * where providers cannot be unregistered even though they have * no probes enabled because of an execution of dtrace -l or * something similar. */ if (later > 0 || fasttrap_cleanup_work || fasttrap_cleanup_drain) { mtx_unlock(&fasttrap_cleanup_mtx); pause("ftclean", hz); mtx_lock(&fasttrap_cleanup_mtx); } else mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx, 0, "ftcl", 0); } /* * Wake up the thread in fasttrap_unload() now that we're done. */ wakeup(&fasttrap_cleanup_drain); mtx_unlock(&fasttrap_cleanup_mtx); kthread_exit(); } /* * Activates the asynchronous cleanup mechanism. */ static void fasttrap_pid_cleanup(void) { mtx_lock(&fasttrap_cleanup_mtx); if (!fasttrap_cleanup_work) { fasttrap_cleanup_work = 1; wakeup(&fasttrap_cleanup_cv); } mtx_unlock(&fasttrap_cleanup_mtx); } /* * This is called from cfork() via dtrace_fasttrap_fork(). The child * process's address space is (roughly) a copy of the parent process's so * we have to remove all the instrumentation we had previously enabled in the * parent. */ static void fasttrap_fork(proc_t *p, proc_t *cp) { #ifndef illumos fasttrap_scrblock_t *scrblk; fasttrap_proc_t *fprc = NULL; #endif pid_t ppid = p->p_pid; int i; -#ifdef illumos ASSERT(curproc == p); +#ifdef illumos ASSERT(p->p_proc_flag & P_PR_LOCK); #else PROC_LOCK_ASSERT(p, MA_OWNED); #endif #ifdef illumos ASSERT(p->p_dtrace_count > 0); #else - if (p->p_dtrace_helpers) { - /* - * dtrace_helpers_duplicate() allocates memory. - */ - _PHOLD(cp); - PROC_UNLOCK(p); - PROC_UNLOCK(cp); - dtrace_helpers_duplicate(p, cp); - PROC_LOCK(cp); - PROC_LOCK(p); - _PRELE(cp); - } /* * This check is purposely here instead of in kern_fork.c because, * for legal resons, we cannot include the dtrace_cddl.h header * inside kern_fork.c and insert if-clause there. */ - if (p->p_dtrace_count == 0) + if (p->p_dtrace_count == 0 && p->p_dtrace_helpers == NULL) return; #endif + ASSERT(cp->p_dtrace_count == 0); /* * This would be simpler and faster if we maintained per-process * hash tables of enabled tracepoints. It could, however, potentially * slow down execution of a tracepoint since we'd need to go * through two levels of indirection. In the future, we should * consider either maintaining per-process ancillary lists of * enabled tracepoints or hanging a pointer to a per-process hash * table of enabled tracepoints off the proc structure. */ /* * We don't have to worry about the child process disappearing * because we're in fork(). */ #ifdef illumos mtx_lock_spin(&cp->p_slock); sprlock_proc(cp); mtx_unlock_spin(&cp->p_slock); #else /* * fasttrap_tracepoint_remove() expects the child process to be * unlocked and the VM then expects curproc to be unlocked. */ _PHOLD(cp); PROC_UNLOCK(cp); PROC_UNLOCK(p); + if (p->p_dtrace_count == 0) + goto dup_helpers; #endif /* * Iterate over every tracepoint looking for ones that belong to the * parent process, and remove each from the child process. */ for (i = 0; i < fasttrap_tpoints.fth_nent; i++) { fasttrap_tracepoint_t *tp; fasttrap_bucket_t *bucket = &fasttrap_tpoints.fth_table[i]; mutex_enter(&bucket->ftb_mtx); for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { if (tp->ftt_pid == ppid && tp->ftt_proc->ftpc_acount != 0) { int ret = fasttrap_tracepoint_remove(cp, tp); ASSERT(ret == 0); /* * The count of active providers can only be * decremented (i.e. to zero) during exec, * exit, and removal of a meta provider so it * should be impossible to drop the count * mid-fork. */ ASSERT(tp->ftt_proc->ftpc_acount != 0); #ifndef illumos fprc = tp->ftt_proc; #endif } } mutex_exit(&bucket->ftb_mtx); #ifndef illumos /* * Unmap any scratch space inherited from the parent's address * space. */ if (fprc != NULL) { mutex_enter(&fprc->ftpc_mtx); LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) { vm_map_remove(&cp->p_vmspace->vm_map, scrblk->ftsb_addr, scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE); } mutex_exit(&fprc->ftpc_mtx); } #endif } #ifdef illumos mutex_enter(&cp->p_lock); sprunlock(cp); #else +dup_helpers: + if (p->p_dtrace_helpers != NULL) + dtrace_helpers_duplicate(p, cp); PROC_LOCK(p); PROC_LOCK(cp); _PRELE(cp); #endif } /* * This is called from proc_exit() or from exec_common() if p_dtrace_probes * is set on the proc structure to indicate that there is a pid provider * associated with this process. */ static void fasttrap_exec_exit(proc_t *p) { #ifndef illumos struct thread *td; #endif #ifdef illumos ASSERT(p == curproc); #else PROC_LOCK_ASSERT(p, MA_OWNED); _PHOLD(p); /* * Since struct threads may be recycled, we cannot rely on t_dtrace_sscr * fields to be zeroed by kdtrace_thread_ctor. Thus we must zero it * ourselves when a process exits. */ FOREACH_THREAD_IN_PROC(p, td) td->t_dtrace_sscr = NULL; PROC_UNLOCK(p); #endif /* * We clean up the pid provider for this process here; user-land * static probes are handled by the meta-provider remove entry point. */ fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0); #ifndef illumos if (p->p_dtrace_helpers) dtrace_helpers_destroy(p); PROC_LOCK(p); _PRELE(p); #endif } /*ARGSUSED*/ static void fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc) { /* * There are no "default" pid probes. */ } static int fasttrap_tracepoint_enable(proc_t *p, fasttrap_probe_t *probe, uint_t index) { fasttrap_tracepoint_t *tp, *new_tp = NULL; fasttrap_bucket_t *bucket; fasttrap_id_t *id; pid_t pid; uintptr_t pc; ASSERT(index < probe->ftp_ntps); pid = probe->ftp_pid; pc = probe->ftp_tps[index].fit_tp->ftt_pc; id = &probe->ftp_tps[index].fit_id; ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid); #ifdef illumos ASSERT(!(p->p_flag & SVFORK)); #endif /* * Before we make any modifications, make sure we've imposed a barrier * on the generation in which this probe was last modified. */ fasttrap_mod_barrier(probe->ftp_gen); bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; /* * If the tracepoint has already been enabled, just add our id to the * list of interested probes. This may be our second time through * this path in which case we'll have constructed the tracepoint we'd * like to install. If we can't find a match, and have an allocated * tracepoint ready to go, enable that one now. * * A tracepoint whose process is defunct is also considered defunct. */ again: mutex_enter(&bucket->ftb_mtx); for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { /* * Note that it's safe to access the active count on the * associated proc structure because we know that at least one * provider (this one) will still be around throughout this * operation. */ if (tp->ftt_pid != pid || tp->ftt_pc != pc || tp->ftt_proc->ftpc_acount == 0) continue; /* * Now that we've found a matching tracepoint, it would be * a decent idea to confirm that the tracepoint is still * enabled and the trap instruction hasn't been overwritten. * Since this is a little hairy, we'll punt for now. */ /* * This can't be the first interested probe. We don't have * to worry about another thread being in the midst of * deleting this tracepoint (which would be the only valid * reason for a tracepoint to have no interested probes) * since we're holding P_PR_LOCK for this process. */ ASSERT(tp->ftt_ids != NULL || tp->ftt_retids != NULL); switch (id->fti_ptype) { case DTFTP_ENTRY: case DTFTP_OFFSETS: case DTFTP_IS_ENABLED: id->fti_next = tp->ftt_ids; membar_producer(); tp->ftt_ids = id; membar_producer(); break; case DTFTP_RETURN: case DTFTP_POST_OFFSETS: id->fti_next = tp->ftt_retids; membar_producer(); tp->ftt_retids = id; membar_producer(); break; default: ASSERT(0); } mutex_exit(&bucket->ftb_mtx); if (new_tp != NULL) { new_tp->ftt_ids = NULL; new_tp->ftt_retids = NULL; } return (0); } /* * If we have a good tracepoint ready to go, install it now while * we have the lock held and no one can screw with us. */ if (new_tp != NULL) { int rc = 0; new_tp->ftt_next = bucket->ftb_data; membar_producer(); bucket->ftb_data = new_tp; membar_producer(); mutex_exit(&bucket->ftb_mtx); /* * Activate the tracepoint in the ISA-specific manner. * If this fails, we need to report the failure, but * indicate that this tracepoint must still be disabled * by calling fasttrap_tracepoint_disable(). */ if (fasttrap_tracepoint_install(p, new_tp) != 0) rc = FASTTRAP_ENABLE_PARTIAL; /* * Increment the count of the number of tracepoints active in * the victim process. */ #ifdef illumos ASSERT(p->p_proc_flag & P_PR_LOCK); #endif p->p_dtrace_count++; return (rc); } mutex_exit(&bucket->ftb_mtx); /* * Initialize the tracepoint that's been preallocated with the probe. */ new_tp = probe->ftp_tps[index].fit_tp; ASSERT(new_tp->ftt_pid == pid); ASSERT(new_tp->ftt_pc == pc); ASSERT(new_tp->ftt_proc == probe->ftp_prov->ftp_proc); ASSERT(new_tp->ftt_ids == NULL); ASSERT(new_tp->ftt_retids == NULL); switch (id->fti_ptype) { case DTFTP_ENTRY: case DTFTP_OFFSETS: case DTFTP_IS_ENABLED: id->fti_next = NULL; new_tp->ftt_ids = id; break; case DTFTP_RETURN: case DTFTP_POST_OFFSETS: id->fti_next = NULL; new_tp->ftt_retids = id; break; default: ASSERT(0); } #ifdef __FreeBSD__ if (SV_PROC_FLAG(p, SV_LP64)) p->p_model = DATAMODEL_LP64; else p->p_model = DATAMODEL_ILP32; #endif /* * If the ISA-dependent initialization goes to plan, go back to the * beginning and try to install this freshly made tracepoint. */ if (fasttrap_tracepoint_init(p, new_tp, pc, id->fti_ptype) == 0) goto again; new_tp->ftt_ids = NULL; new_tp->ftt_retids = NULL; return (FASTTRAP_ENABLE_FAIL); } static void fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index) { fasttrap_bucket_t *bucket; fasttrap_provider_t *provider = probe->ftp_prov; fasttrap_tracepoint_t **pp, *tp; fasttrap_id_t *id, **idp = NULL; pid_t pid; uintptr_t pc; ASSERT(index < probe->ftp_ntps); pid = probe->ftp_pid; pc = probe->ftp_tps[index].fit_tp->ftt_pc; id = &probe->ftp_tps[index].fit_id; ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid); /* * Find the tracepoint and make sure that our id is one of the * ones registered with it. */ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; mutex_enter(&bucket->ftb_mtx); for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { if (tp->ftt_pid == pid && tp->ftt_pc == pc && tp->ftt_proc == provider->ftp_proc) break; } /* * If we somehow lost this tracepoint, we're in a world of hurt. */ ASSERT(tp != NULL); switch (id->fti_ptype) { case DTFTP_ENTRY: case DTFTP_OFFSETS: case DTFTP_IS_ENABLED: ASSERT(tp->ftt_ids != NULL); idp = &tp->ftt_ids; break; case DTFTP_RETURN: case DTFTP_POST_OFFSETS: ASSERT(tp->ftt_retids != NULL); idp = &tp->ftt_retids; break; default: ASSERT(0); } while ((*idp)->fti_probe != probe) { idp = &(*idp)->fti_next; ASSERT(*idp != NULL); } id = *idp; *idp = id->fti_next; membar_producer(); ASSERT(id->fti_probe == probe); /* * If there are other registered enablings of this tracepoint, we're * all done, but if this was the last probe assocated with this * this tracepoint, we need to remove and free it. */ if (tp->ftt_ids != NULL || tp->ftt_retids != NULL) { /* * If the current probe's tracepoint is in use, swap it * for an unused tracepoint. */ if (tp == probe->ftp_tps[index].fit_tp) { fasttrap_probe_t *tmp_probe; fasttrap_tracepoint_t **tmp_tp; uint_t tmp_index; if (tp->ftt_ids != NULL) { tmp_probe = tp->ftt_ids->fti_probe; /* LINTED - alignment */ tmp_index = FASTTRAP_ID_INDEX(tp->ftt_ids); tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp; } else { tmp_probe = tp->ftt_retids->fti_probe; /* LINTED - alignment */ tmp_index = FASTTRAP_ID_INDEX(tp->ftt_retids); tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp; } ASSERT(*tmp_tp != NULL); ASSERT(*tmp_tp != probe->ftp_tps[index].fit_tp); ASSERT((*tmp_tp)->ftt_ids == NULL); ASSERT((*tmp_tp)->ftt_retids == NULL); probe->ftp_tps[index].fit_tp = *tmp_tp; *tmp_tp = tp; } mutex_exit(&bucket->ftb_mtx); /* * Tag the modified probe with the generation in which it was * changed. */ probe->ftp_gen = fasttrap_mod_gen; return; } mutex_exit(&bucket->ftb_mtx); /* * We can't safely remove the tracepoint from the set of active * tracepoints until we've actually removed the fasttrap instruction * from the process's text. We can, however, operate on this * tracepoint secure in the knowledge that no other thread is going to * be looking at it since we hold P_PR_LOCK on the process if it's * live or we hold the provider lock on the process if it's dead and * gone. */ /* * We only need to remove the actual instruction if we're looking * at an existing process */ if (p != NULL) { /* * If we fail to restore the instruction we need to kill * this process since it's in a completely unrecoverable * state. */ if (fasttrap_tracepoint_remove(p, tp) != 0) fasttrap_sigtrap(p, NULL, pc); /* * Decrement the count of the number of tracepoints active * in the victim process. */ #ifdef illumos ASSERT(p->p_proc_flag & P_PR_LOCK); #endif p->p_dtrace_count--; } /* * Remove the probe from the hash table of active tracepoints. */ mutex_enter(&bucket->ftb_mtx); pp = (fasttrap_tracepoint_t **)&bucket->ftb_data; ASSERT(*pp != NULL); while (*pp != tp) { pp = &(*pp)->ftt_next; ASSERT(*pp != NULL); } *pp = tp->ftt_next; membar_producer(); mutex_exit(&bucket->ftb_mtx); /* * Tag the modified probe with the generation in which it was changed. */ probe->ftp_gen = fasttrap_mod_gen; } static void fasttrap_enable_callbacks(void) { /* * We don't have to play the rw lock game here because we're * providing something rather than taking something away -- * we can be sure that no threads have tried to follow this * function pointer yet. */ mutex_enter(&fasttrap_count_mtx); if (fasttrap_pid_count == 0) { ASSERT(dtrace_pid_probe_ptr == NULL); ASSERT(dtrace_return_probe_ptr == NULL); dtrace_pid_probe_ptr = &fasttrap_pid_probe; dtrace_return_probe_ptr = &fasttrap_return_probe; } ASSERT(dtrace_pid_probe_ptr == &fasttrap_pid_probe); ASSERT(dtrace_return_probe_ptr == &fasttrap_return_probe); fasttrap_pid_count++; mutex_exit(&fasttrap_count_mtx); } static void fasttrap_disable_callbacks(void) { #ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); #endif mutex_enter(&fasttrap_count_mtx); ASSERT(fasttrap_pid_count > 0); fasttrap_pid_count--; if (fasttrap_pid_count == 0) { #ifdef illumos cpu_t *cur, *cpu = CPU; for (cur = cpu->cpu_next_onln; cur != cpu; cur = cur->cpu_next_onln) { rw_enter(&cur->cpu_ft_lock, RW_WRITER); } #endif dtrace_pid_probe_ptr = NULL; dtrace_return_probe_ptr = NULL; #ifdef illumos for (cur = cpu->cpu_next_onln; cur != cpu; cur = cur->cpu_next_onln) { rw_exit(&cur->cpu_ft_lock); } #endif } mutex_exit(&fasttrap_count_mtx); } /*ARGSUSED*/ static void fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; proc_t *p = NULL; int i, rc; ASSERT(probe != NULL); ASSERT(!probe->ftp_enabled); ASSERT(id == probe->ftp_id); #ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); #endif /* * Increment the count of enabled probes on this probe's provider; * the provider can't go away while the probe still exists. We * must increment this even if we aren't able to properly enable * this probe. */ mutex_enter(&probe->ftp_prov->ftp_mtx); probe->ftp_prov->ftp_rcount++; mutex_exit(&probe->ftp_prov->ftp_mtx); /* * If this probe's provider is retired (meaning it was valid in a * previously exec'ed incarnation of this address space), bail out. The * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) return; /* * If we can't find the process, it may be that we're in the context of * a fork in which the traced process is being born and we're copying * USDT probes. Otherwise, the process is gone so bail. */ #ifdef illumos if ((p = sprlock(probe->ftp_pid)) == NULL) { if ((curproc->p_flag & SFORKING) == 0) return; mutex_enter(&pidlock); p = prfind(probe->ftp_pid); if (p == NULL) { /* * So it's not that the target process is being born, * it's that it isn't there at all (and we simply * happen to be forking). Anyway, we know that the * target is definitely gone, so bail out. */ mutex_exit(&pidlock); return (0); } /* * Confirm that curproc is indeed forking the process in which * we're trying to enable probes. */ ASSERT(p->p_parent == curproc); ASSERT(p->p_stat == SIDL); mutex_enter(&p->p_lock); mutex_exit(&pidlock); sprlock_proc(p); } ASSERT(!(p->p_flag & SVFORK)); mutex_exit(&p->p_lock); #else if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0) return; #endif /* * We have to enable the trap entry point before any user threads have * the chance to execute the trap instruction we're about to place * in their process's text. */ fasttrap_enable_callbacks(); /* * Enable all the tracepoints and add this probe's id to each * tracepoint's list of active probes. */ for (i = 0; i < probe->ftp_ntps; i++) { if ((rc = fasttrap_tracepoint_enable(p, probe, i)) != 0) { /* * If enabling the tracepoint failed completely, * we don't have to disable it; if the failure * was only partial we must disable it. */ if (rc == FASTTRAP_ENABLE_FAIL) i--; else ASSERT(rc == FASTTRAP_ENABLE_PARTIAL); /* * Back up and pull out all the tracepoints we've * created so far for this probe. */ while (i >= 0) { fasttrap_tracepoint_disable(p, probe, i); i--; } #ifdef illumos mutex_enter(&p->p_lock); sprunlock(p); #else PRELE(p); #endif /* * Since we're not actually enabling this probe, * drop our reference on the trap table entry. */ fasttrap_disable_callbacks(); return; } } #ifdef illumos mutex_enter(&p->p_lock); sprunlock(p); #else PRELE(p); #endif probe->ftp_enabled = 1; } /*ARGSUSED*/ static void fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; fasttrap_provider_t *provider = probe->ftp_prov; proc_t *p; int i, whack = 0; ASSERT(id == probe->ftp_id); mutex_enter(&provider->ftp_mtx); /* * We won't be able to acquire a /proc-esque lock on the process * iff the process is dead and gone. In this case, we rely on the * provider lock as a point of mutual exclusion to prevent other * DTrace consumers from disabling this probe. */ if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0) p = NULL; /* * Disable all the associated tracepoints (for fully enabled probes). */ if (probe->ftp_enabled) { for (i = 0; i < probe->ftp_ntps; i++) { fasttrap_tracepoint_disable(p, probe, i); } } ASSERT(provider->ftp_rcount > 0); provider->ftp_rcount--; if (p != NULL) { /* * Even though we may not be able to remove it entirely, we * mark this retired provider to get a chance to remove some * of the associated probes. */ if (provider->ftp_retired && !provider->ftp_marked) whack = provider->ftp_marked = 1; mutex_exit(&provider->ftp_mtx); } else { /* * If the process is dead, we're just waiting for the * last probe to be disabled to be able to free it. */ if (provider->ftp_rcount == 0 && !provider->ftp_marked) whack = provider->ftp_marked = 1; mutex_exit(&provider->ftp_mtx); } if (whack) fasttrap_pid_cleanup(); #ifdef __FreeBSD__ if (p != NULL) PRELE(p); #endif if (!probe->ftp_enabled) return; probe->ftp_enabled = 0; #ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); #endif fasttrap_disable_callbacks(); } /*ARGSUSED*/ static void fasttrap_pid_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) { fasttrap_probe_t *probe = parg; char *str; int i, ndx; desc->dtargd_native[0] = '\0'; desc->dtargd_xlate[0] = '\0'; if (probe->ftp_prov->ftp_retired != 0 || desc->dtargd_ndx >= probe->ftp_nargs) { desc->dtargd_ndx = DTRACE_ARGNONE; return; } ndx = (probe->ftp_argmap != NULL) ? probe->ftp_argmap[desc->dtargd_ndx] : desc->dtargd_ndx; str = probe->ftp_ntypes; for (i = 0; i < ndx; i++) { str += strlen(str) + 1; } ASSERT(strlen(str + 1) < sizeof (desc->dtargd_native)); (void) strcpy(desc->dtargd_native, str); if (probe->ftp_xtypes == NULL) return; str = probe->ftp_xtypes; for (i = 0; i < desc->dtargd_ndx; i++) { str += strlen(str) + 1; } ASSERT(strlen(str + 1) < sizeof (desc->dtargd_xlate)); (void) strcpy(desc->dtargd_xlate, str); } /*ARGSUSED*/ static void fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; int i; size_t size; ASSERT(probe != NULL); ASSERT(!probe->ftp_enabled); ASSERT(fasttrap_total >= probe->ftp_ntps); atomic_add_32(&fasttrap_total, -probe->ftp_ntps); size = offsetof(fasttrap_probe_t, ftp_tps[probe->ftp_ntps]); if (probe->ftp_gen + 1 >= fasttrap_mod_gen) fasttrap_mod_barrier(probe->ftp_gen); for (i = 0; i < probe->ftp_ntps; i++) { kmem_free(probe->ftp_tps[i].fit_tp, sizeof (fasttrap_tracepoint_t)); } kmem_free(probe, size); } static const dtrace_pattr_t pid_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, }; static dtrace_pops_t pid_pops = { .dtps_provide = fasttrap_pid_provide, .dtps_provide_module = NULL, .dtps_enable = fasttrap_pid_enable, .dtps_disable = fasttrap_pid_disable, .dtps_suspend = NULL, .dtps_resume = NULL, .dtps_getargdesc = fasttrap_pid_getargdesc, .dtps_getargval = fasttrap_pid_getarg, .dtps_usermode = NULL, .dtps_destroy = fasttrap_pid_destroy }; static dtrace_pops_t usdt_pops = { .dtps_provide = fasttrap_pid_provide, .dtps_provide_module = NULL, .dtps_enable = fasttrap_pid_enable, .dtps_disable = fasttrap_pid_disable, .dtps_suspend = NULL, .dtps_resume = NULL, .dtps_getargdesc = fasttrap_pid_getargdesc, .dtps_getargval = fasttrap_usdt_getarg, .dtps_usermode = NULL, .dtps_destroy = fasttrap_pid_destroy }; static fasttrap_proc_t * fasttrap_proc_lookup(pid_t pid) { fasttrap_bucket_t *bucket; fasttrap_proc_t *fprc, *new_fprc; bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; mutex_enter(&bucket->ftb_mtx); for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) { if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) { mutex_enter(&fprc->ftpc_mtx); mutex_exit(&bucket->ftb_mtx); fprc->ftpc_rcount++; atomic_inc_64(&fprc->ftpc_acount); ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); mutex_exit(&fprc->ftpc_mtx); return (fprc); } } /* * Drop the bucket lock so we don't try to perform a sleeping * allocation under it. */ mutex_exit(&bucket->ftb_mtx); new_fprc = kmem_zalloc(sizeof (fasttrap_proc_t), KM_SLEEP); new_fprc->ftpc_pid = pid; new_fprc->ftpc_rcount = 1; new_fprc->ftpc_acount = 1; #ifndef illumos mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT, NULL); #endif mutex_enter(&bucket->ftb_mtx); /* * Take another lap through the list to make sure a proc hasn't * been created for this pid while we weren't under the bucket lock. */ for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) { if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) { mutex_enter(&fprc->ftpc_mtx); mutex_exit(&bucket->ftb_mtx); fprc->ftpc_rcount++; atomic_inc_64(&fprc->ftpc_acount); ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); mutex_exit(&fprc->ftpc_mtx); kmem_free(new_fprc, sizeof (fasttrap_proc_t)); return (fprc); } } new_fprc->ftpc_next = bucket->ftb_data; bucket->ftb_data = new_fprc; mutex_exit(&bucket->ftb_mtx); return (new_fprc); } static void fasttrap_proc_release(fasttrap_proc_t *proc) { fasttrap_bucket_t *bucket; fasttrap_proc_t *fprc, **fprcp; pid_t pid = proc->ftpc_pid; #ifndef illumos fasttrap_scrblock_t *scrblk, *scrblktmp; fasttrap_scrspace_t *scrspc, *scrspctmp; struct proc *p; struct thread *td; #endif mutex_enter(&proc->ftpc_mtx); ASSERT(proc->ftpc_rcount != 0); ASSERT(proc->ftpc_acount <= proc->ftpc_rcount); if (--proc->ftpc_rcount != 0) { mutex_exit(&proc->ftpc_mtx); return; } #ifndef illumos /* * Free all structures used to manage per-thread scratch space. */ LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next, scrblktmp) { LIST_REMOVE(scrblk, ftsb_next); free(scrblk, M_SOLARIS); } LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) { LIST_REMOVE(scrspc, ftss_next); free(scrspc, M_SOLARIS); } LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) { LIST_REMOVE(scrspc, ftss_next); free(scrspc, M_SOLARIS); } if ((p = pfind(pid)) != NULL) { FOREACH_THREAD_IN_PROC(p, td) td->t_dtrace_sscr = NULL; PROC_UNLOCK(p); } #endif mutex_exit(&proc->ftpc_mtx); /* * There should definitely be no live providers associated with this * process at this point. */ ASSERT(proc->ftpc_acount == 0); bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; mutex_enter(&bucket->ftb_mtx); fprcp = (fasttrap_proc_t **)&bucket->ftb_data; while ((fprc = *fprcp) != NULL) { if (fprc == proc) break; fprcp = &fprc->ftpc_next; } /* * Something strange has happened if we can't find the proc. */ ASSERT(fprc != NULL); *fprcp = fprc->ftpc_next; mutex_exit(&bucket->ftb_mtx); kmem_free(fprc, sizeof (fasttrap_proc_t)); } /* * Lookup a fasttrap-managed provider based on its name and associated pid. * If the pattr argument is non-NULL, this function instantiates the provider * if it doesn't exist otherwise it returns NULL. The provider is returned * with its lock held. */ static fasttrap_provider_t * fasttrap_provider_lookup(pid_t pid, const char *name, const dtrace_pattr_t *pattr) { fasttrap_provider_t *fp, *new_fp = NULL; fasttrap_bucket_t *bucket; char provname[DTRACE_PROVNAMELEN]; proc_t *p; cred_t *cred; ASSERT(strlen(name) < sizeof (fp->ftp_name)); ASSERT(pattr != NULL); bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)]; mutex_enter(&bucket->ftb_mtx); /* * Take a lap through the list and return the match if we find it. */ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) { if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 && !fp->ftp_retired) { mutex_enter(&fp->ftp_mtx); mutex_exit(&bucket->ftb_mtx); return (fp); } } /* * Drop the bucket lock so we don't try to perform a sleeping * allocation under it. */ mutex_exit(&bucket->ftb_mtx); /* * Make sure the process exists, isn't a child created as the result * of a vfork(2), and isn't a zombie (but may be in fork). */ if ((p = pfind(pid)) == NULL) return (NULL); /* * Increment p_dtrace_probes so that the process knows to inform us * when it exits or execs. fasttrap_provider_free() decrements this * when we're done with this provider. */ p->p_dtrace_probes++; /* * Grab the credentials for this process so we have * something to pass to dtrace_register(). */ PROC_LOCK_ASSERT(p, MA_OWNED); crhold(p->p_ucred); cred = p->p_ucred; PROC_UNLOCK(p); new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP); new_fp->ftp_pid = pid; new_fp->ftp_proc = fasttrap_proc_lookup(pid); #ifndef illumos mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL); mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL); #endif ASSERT(new_fp->ftp_proc != NULL); mutex_enter(&bucket->ftb_mtx); /* * Take another lap through the list to make sure a provider hasn't * been created for this pid while we weren't under the bucket lock. */ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) { if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 && !fp->ftp_retired) { mutex_enter(&fp->ftp_mtx); mutex_exit(&bucket->ftb_mtx); fasttrap_provider_free(new_fp); crfree(cred); return (fp); } } (void) strcpy(new_fp->ftp_name, name); /* * Fail and return NULL if either the provider name is too long * or we fail to register this new provider with the DTrace * framework. Note that this is the only place we ever construct * the full provider name -- we keep it in pieces in the provider * structure. */ if (snprintf(provname, sizeof (provname), "%s%u", name, (uint_t)pid) >= sizeof (provname) || dtrace_register(provname, pattr, DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER, cred, pattr == &pid_attr ? &pid_pops : &usdt_pops, new_fp, &new_fp->ftp_provid) != 0) { mutex_exit(&bucket->ftb_mtx); fasttrap_provider_free(new_fp); crfree(cred); return (NULL); } new_fp->ftp_next = bucket->ftb_data; bucket->ftb_data = new_fp; mutex_enter(&new_fp->ftp_mtx); mutex_exit(&bucket->ftb_mtx); crfree(cred); return (new_fp); } static void fasttrap_provider_free(fasttrap_provider_t *provider) { pid_t pid = provider->ftp_pid; proc_t *p; /* * There need to be no associated enabled probes, no consumers * creating probes, and no meta providers referencing this provider. */ ASSERT(provider->ftp_rcount == 0); ASSERT(provider->ftp_ccount == 0); ASSERT(provider->ftp_mcount == 0); /* * If this provider hasn't been retired, we need to explicitly drop the * count of active providers on the associated process structure. */ if (!provider->ftp_retired) { atomic_dec_64(&provider->ftp_proc->ftpc_acount); ASSERT(provider->ftp_proc->ftpc_acount < provider->ftp_proc->ftpc_rcount); } fasttrap_proc_release(provider->ftp_proc); #ifndef illumos mutex_destroy(&provider->ftp_mtx); mutex_destroy(&provider->ftp_cmtx); #endif kmem_free(provider, sizeof (fasttrap_provider_t)); /* * Decrement p_dtrace_probes on the process whose provider we're * freeing. We don't have to worry about clobbering somone else's * modifications to it because we have locked the bucket that * corresponds to this process's hash chain in the provider hash * table. Don't sweat it if we can't find the process. */ if ((p = pfind(pid)) == NULL) { return; } p->p_dtrace_probes--; #ifndef illumos PROC_UNLOCK(p); #endif } static void fasttrap_provider_retire(pid_t pid, const char *name, int mprov) { fasttrap_provider_t *fp; fasttrap_bucket_t *bucket; dtrace_provider_id_t provid; ASSERT(strlen(name) < sizeof (fp->ftp_name)); bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)]; mutex_enter(&bucket->ftb_mtx); for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) { if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 && !fp->ftp_retired) break; } if (fp == NULL) { mutex_exit(&bucket->ftb_mtx); return; } mutex_enter(&fp->ftp_mtx); ASSERT(!mprov || fp->ftp_mcount > 0); if (mprov && --fp->ftp_mcount != 0) { mutex_exit(&fp->ftp_mtx); mutex_exit(&bucket->ftb_mtx); return; } /* * Mark the provider to be removed in our post-processing step, mark it * retired, and drop the active count on its proc. Marking it indicates * that we should try to remove it; setting the retired flag indicates * that we're done with this provider; dropping the active the proc * releases our hold, and when this reaches zero (as it will during * exit or exec) the proc and associated providers become defunct. * * We obviously need to take the bucket lock before the provider lock * to perform the lookup, but we need to drop the provider lock * before calling into the DTrace framework since we acquire the * provider lock in callbacks invoked from the DTrace framework. The * bucket lock therefore protects the integrity of the provider hash * table. */ atomic_dec_64(&fp->ftp_proc->ftpc_acount); ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount); fp->ftp_retired = 1; fp->ftp_marked = 1; provid = fp->ftp_provid; mutex_exit(&fp->ftp_mtx); /* * We don't have to worry about invalidating the same provider twice * since fasttrap_provider_lookup() will ignore provider that have * been marked as retired. */ dtrace_invalidate(provid); mutex_exit(&bucket->ftb_mtx); fasttrap_pid_cleanup(); } static int fasttrap_uint32_cmp(const void *ap, const void *bp) { return (*(const uint32_t *)ap - *(const uint32_t *)bp); } static int fasttrap_uint64_cmp(const void *ap, const void *bp) { return (*(const uint64_t *)ap - *(const uint64_t *)bp); } static int fasttrap_add_probe(fasttrap_probe_spec_t *pdata) { fasttrap_provider_t *provider; fasttrap_probe_t *pp; fasttrap_tracepoint_t *tp; char *name; int i, aframes = 0, whack; /* * There needs to be at least one desired trace point. */ if (pdata->ftps_noffs == 0) return (EINVAL); switch (pdata->ftps_type) { case DTFTP_ENTRY: name = "entry"; aframes = FASTTRAP_ENTRY_AFRAMES; break; case DTFTP_RETURN: name = "return"; aframes = FASTTRAP_RETURN_AFRAMES; break; case DTFTP_OFFSETS: name = NULL; break; default: return (EINVAL); } if ((provider = fasttrap_provider_lookup(pdata->ftps_pid, FASTTRAP_PID_NAME, &pid_attr)) == NULL) return (ESRCH); /* * Increment this reference count to indicate that a consumer is * actively adding a new probe associated with this provider. This * prevents the provider from being deleted -- we'll need to check * for pending deletions when we drop this reference count. */ provider->ftp_ccount++; mutex_exit(&provider->ftp_mtx); /* * Grab the creation lock to ensure consistency between calls to * dtrace_probe_lookup() and dtrace_probe_create() in the face of * other threads creating probes. We must drop the provider lock * before taking this lock to avoid a three-way deadlock with the * DTrace framework. */ mutex_enter(&provider->ftp_cmtx); if (name == NULL) { for (i = 0; i < pdata->ftps_noffs; i++) { char name_str[17]; (void) sprintf(name_str, "%llx", (unsigned long long)pdata->ftps_offs[i]); if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod, pdata->ftps_func, name_str) != 0) continue; atomic_inc_32(&fasttrap_total); if (fasttrap_total > fasttrap_max) { atomic_dec_32(&fasttrap_total); goto no_mem; } pp = kmem_zalloc(sizeof (fasttrap_probe_t), KM_SLEEP); pp->ftp_prov = provider; pp->ftp_faddr = pdata->ftps_pc; pp->ftp_fsize = pdata->ftps_size; pp->ftp_pid = pdata->ftps_pid; pp->ftp_ntps = 1; tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP); tp->ftt_proc = provider->ftp_proc; tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc; tp->ftt_pid = pdata->ftps_pid; pp->ftp_tps[0].fit_tp = tp; pp->ftp_tps[0].fit_id.fti_probe = pp; pp->ftp_tps[0].fit_id.fti_ptype = pdata->ftps_type; pp->ftp_id = dtrace_probe_create(provider->ftp_provid, pdata->ftps_mod, pdata->ftps_func, name_str, FASTTRAP_OFFSET_AFRAMES, pp); } } else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod, pdata->ftps_func, name) == 0) { atomic_add_32(&fasttrap_total, pdata->ftps_noffs); if (fasttrap_total > fasttrap_max) { atomic_add_32(&fasttrap_total, -pdata->ftps_noffs); goto no_mem; } /* * Make sure all tracepoint program counter values are unique. * We later assume that each probe has exactly one tracepoint * for a given pc. */ qsort(pdata->ftps_offs, pdata->ftps_noffs, sizeof (uint64_t), fasttrap_uint64_cmp); for (i = 1; i < pdata->ftps_noffs; i++) { if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1]) continue; atomic_add_32(&fasttrap_total, -pdata->ftps_noffs); goto no_mem; } ASSERT(pdata->ftps_noffs > 0); pp = kmem_zalloc(offsetof(fasttrap_probe_t, ftp_tps[pdata->ftps_noffs]), KM_SLEEP); pp->ftp_prov = provider; pp->ftp_faddr = pdata->ftps_pc; pp->ftp_fsize = pdata->ftps_size; pp->ftp_pid = pdata->ftps_pid; pp->ftp_ntps = pdata->ftps_noffs; for (i = 0; i < pdata->ftps_noffs; i++) { tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP); tp->ftt_proc = provider->ftp_proc; tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc; tp->ftt_pid = pdata->ftps_pid; pp->ftp_tps[i].fit_tp = tp; pp->ftp_tps[i].fit_id.fti_probe = pp; pp->ftp_tps[i].fit_id.fti_ptype = pdata->ftps_type; } pp->ftp_id = dtrace_probe_create(provider->ftp_provid, pdata->ftps_mod, pdata->ftps_func, name, aframes, pp); } mutex_exit(&provider->ftp_cmtx); /* * We know that the provider is still valid since we incremented the * creation reference count. If someone tried to clean up this provider * while we were using it (e.g. because the process called exec(2) or * exit(2)), take note of that and try to clean it up now. */ mutex_enter(&provider->ftp_mtx); provider->ftp_ccount--; whack = provider->ftp_retired; mutex_exit(&provider->ftp_mtx); if (whack) fasttrap_pid_cleanup(); return (0); no_mem: /* * If we've exhausted the allowable resources, we'll try to remove * this provider to free some up. This is to cover the case where * the user has accidentally created many more probes than was * intended (e.g. pid123:::). */ mutex_exit(&provider->ftp_cmtx); mutex_enter(&provider->ftp_mtx); provider->ftp_ccount--; provider->ftp_marked = 1; mutex_exit(&provider->ftp_mtx); fasttrap_pid_cleanup(); return (ENOMEM); } /*ARGSUSED*/ static void * fasttrap_meta_provide(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid) { fasttrap_provider_t *provider; /* * A 32-bit unsigned integer (like a pid for example) can be * expressed in 10 or fewer decimal digits. Make sure that we'll * have enough space for the provider name. */ if (strlen(dhpv->dthpv_provname) + 10 >= sizeof (provider->ftp_name)) { printf("failed to instantiate provider %s: " "name too long to accomodate pid", dhpv->dthpv_provname); return (NULL); } /* * Don't let folks spoof the true pid provider. */ if (strcmp(dhpv->dthpv_provname, FASTTRAP_PID_NAME) == 0) { printf("failed to instantiate provider %s: " "%s is an invalid name", dhpv->dthpv_provname, FASTTRAP_PID_NAME); return (NULL); } /* * The highest stability class that fasttrap supports is ISA; cap * the stability of the new provider accordingly. */ if (dhpv->dthpv_pattr.dtpa_provider.dtat_class > DTRACE_CLASS_ISA) dhpv->dthpv_pattr.dtpa_provider.dtat_class = DTRACE_CLASS_ISA; if (dhpv->dthpv_pattr.dtpa_mod.dtat_class > DTRACE_CLASS_ISA) dhpv->dthpv_pattr.dtpa_mod.dtat_class = DTRACE_CLASS_ISA; if (dhpv->dthpv_pattr.dtpa_func.dtat_class > DTRACE_CLASS_ISA) dhpv->dthpv_pattr.dtpa_func.dtat_class = DTRACE_CLASS_ISA; if (dhpv->dthpv_pattr.dtpa_name.dtat_class > DTRACE_CLASS_ISA) dhpv->dthpv_pattr.dtpa_name.dtat_class = DTRACE_CLASS_ISA; if (dhpv->dthpv_pattr.dtpa_args.dtat_class > DTRACE_CLASS_ISA) dhpv->dthpv_pattr.dtpa_args.dtat_class = DTRACE_CLASS_ISA; if ((provider = fasttrap_provider_lookup(pid, dhpv->dthpv_provname, &dhpv->dthpv_pattr)) == NULL) { printf("failed to instantiate provider %s for " "process %u", dhpv->dthpv_provname, (uint_t)pid); return (NULL); } /* * Up the meta provider count so this provider isn't removed until * the meta provider has been told to remove it. */ provider->ftp_mcount++; mutex_exit(&provider->ftp_mtx); return (provider); } /* * We know a few things about our context here: we know that the probe being * created doesn't already exist (DTrace won't load DOF at the same address * twice, even if explicitly told to do so) and we know that we are * single-threaded with respect to the meta provider machinery. Knowing that * this is a new probe and that there is no way for us to race with another * operation on this provider allows us an important optimization: we need not * lookup a probe before adding it. Saving this lookup is important because * this code is in the fork path for processes with USDT probes, and lookups * here are potentially very expensive because of long hash conflicts on * module, function and name (DTrace doesn't hash on provider name). */ /*ARGSUSED*/ static void fasttrap_meta_create_probe(void *arg, void *parg, dtrace_helper_probedesc_t *dhpb) { fasttrap_provider_t *provider = parg; fasttrap_probe_t *pp; fasttrap_tracepoint_t *tp; int i, j; uint32_t ntps; /* * Since the meta provider count is non-zero we don't have to worry * about this provider disappearing. */ ASSERT(provider->ftp_mcount > 0); /* * The offsets must be unique. */ qsort(dhpb->dthpb_offs, dhpb->dthpb_noffs, sizeof (uint32_t), fasttrap_uint32_cmp); for (i = 1; i < dhpb->dthpb_noffs; i++) { if (dhpb->dthpb_base + dhpb->dthpb_offs[i] <= dhpb->dthpb_base + dhpb->dthpb_offs[i - 1]) return; } qsort(dhpb->dthpb_enoffs, dhpb->dthpb_nenoffs, sizeof (uint32_t), fasttrap_uint32_cmp); for (i = 1; i < dhpb->dthpb_nenoffs; i++) { if (dhpb->dthpb_base + dhpb->dthpb_enoffs[i] <= dhpb->dthpb_base + dhpb->dthpb_enoffs[i - 1]) return; } ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs; ASSERT(ntps > 0); atomic_add_32(&fasttrap_total, ntps); if (fasttrap_total > fasttrap_max) { atomic_add_32(&fasttrap_total, -ntps); return; } pp = kmem_zalloc(offsetof(fasttrap_probe_t, ftp_tps[ntps]), KM_SLEEP); pp->ftp_prov = provider; pp->ftp_pid = provider->ftp_pid; pp->ftp_ntps = ntps; pp->ftp_nargs = dhpb->dthpb_xargc; pp->ftp_xtypes = dhpb->dthpb_xtypes; pp->ftp_ntypes = dhpb->dthpb_ntypes; /* * First create a tracepoint for each actual point of interest. */ for (i = 0; i < dhpb->dthpb_noffs; i++) { tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP); tp->ftt_proc = provider->ftp_proc; tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_offs[i]; tp->ftt_pid = provider->ftp_pid; pp->ftp_tps[i].fit_tp = tp; pp->ftp_tps[i].fit_id.fti_probe = pp; #ifdef __sparc pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_POST_OFFSETS; #else pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_OFFSETS; #endif } /* * Then create a tracepoint for each is-enabled point. */ for (j = 0; i < ntps; i++, j++) { tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP); tp->ftt_proc = provider->ftp_proc; tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_enoffs[j]; tp->ftt_pid = provider->ftp_pid; pp->ftp_tps[i].fit_tp = tp; pp->ftp_tps[i].fit_id.fti_probe = pp; pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_IS_ENABLED; } /* * If the arguments are shuffled around we set the argument remapping * table. Later, when the probe fires, we only remap the arguments * if the table is non-NULL. */ for (i = 0; i < dhpb->dthpb_xargc; i++) { if (dhpb->dthpb_args[i] != i) { pp->ftp_argmap = dhpb->dthpb_args; break; } } /* * The probe is fully constructed -- register it with DTrace. */ pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod, dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp); } /*ARGSUSED*/ static void fasttrap_meta_remove(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid) { /* * Clean up the USDT provider. There may be active consumers of the * provider busy adding probes, no damage will actually befall the * provider until that count has dropped to zero. This just puts * the provider on death row. */ fasttrap_provider_retire(pid, dhpv->dthpv_provname, 1); } static dtrace_mops_t fasttrap_mops = { .dtms_create_probe = fasttrap_meta_create_probe, .dtms_provide_pid = fasttrap_meta_provide, .dtms_remove_pid = fasttrap_meta_remove }; /*ARGSUSED*/ static int fasttrap_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused) { return (0); } /*ARGSUSED*/ static int fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag, struct thread *td) { if (!dtrace_attached()) return (EAGAIN); if (cmd == FASTTRAPIOC_MAKEPROBE) { fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg; fasttrap_probe_spec_t *probe; uint64_t noffs; size_t size; int ret, err; if (copyin(&uprobe->ftps_noffs, &noffs, sizeof (uprobe->ftps_noffs))) return (EFAULT); /* * Probes must have at least one tracepoint. */ if (noffs == 0) return (EINVAL); size = sizeof (fasttrap_probe_spec_t) + sizeof (probe->ftps_offs[0]) * (noffs - 1); if (size > 1024 * 1024) return (ENOMEM); probe = kmem_alloc(size, KM_SLEEP); if (copyin(uprobe, probe, size) != 0 || probe->ftps_noffs != noffs) { kmem_free(probe, size); return (EFAULT); } /* * Verify that the function and module strings contain no * funny characters. */ if (u8_validate(probe->ftps_func, strlen(probe->ftps_func), NULL, U8_VALIDATE_ENTIRE, &err) < 0) { ret = EINVAL; goto err; } if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod), NULL, U8_VALIDATE_ENTIRE, &err) < 0) { ret = EINVAL; goto err; } #ifdef notyet if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) { proc_t *p; pid_t pid = probe->ftps_pid; mutex_enter(&pidlock); /* * Report an error if the process doesn't exist * or is actively being birthed. */ if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) { mutex_exit(&pidlock); return (ESRCH); } mutex_enter(&p->p_lock); mutex_exit(&pidlock); if ((ret = priv_proc_cred_perm(cr, p, NULL, VREAD | VWRITE)) != 0) { mutex_exit(&p->p_lock); return (ret); } mutex_exit(&p->p_lock); } #endif /* notyet */ ret = fasttrap_add_probe(probe); err: kmem_free(probe, size); return (ret); } else if (cmd == FASTTRAPIOC_GETINSTR) { fasttrap_instr_query_t instr; fasttrap_tracepoint_t *tp; uint_t index; #ifdef notyet int ret; #endif #ifdef illumos if (copyin((void *)arg, &instr, sizeof (instr)) != 0) return (EFAULT); #endif #ifdef notyet if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) { proc_t *p; pid_t pid = instr.ftiq_pid; mutex_enter(&pidlock); /* * Report an error if the process doesn't exist * or is actively being birthed. */ if ((p == pfind(pid)) == NULL || p->p_stat == SIDL) { mutex_exit(&pidlock); return (ESRCH); } mutex_enter(&p->p_lock); mutex_exit(&pidlock); if ((ret = priv_proc_cred_perm(cr, p, NULL, VREAD)) != 0) { mutex_exit(&p->p_lock); return (ret); } mutex_exit(&p->p_lock); } #endif /* notyet */ index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc); mutex_enter(&fasttrap_tpoints.fth_table[index].ftb_mtx); tp = fasttrap_tpoints.fth_table[index].ftb_data; while (tp != NULL) { if (instr.ftiq_pid == tp->ftt_pid && instr.ftiq_pc == tp->ftt_pc && tp->ftt_proc->ftpc_acount != 0) break; tp = tp->ftt_next; } if (tp == NULL) { mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx); return (ENOENT); } bcopy(&tp->ftt_instr, &instr.ftiq_instr, sizeof (instr.ftiq_instr)); mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx); if (copyout(&instr, (void *)arg, sizeof (instr)) != 0) return (EFAULT); return (0); } return (EINVAL); } static int fasttrap_load(void) { ulong_t nent; int i, ret; /* Create the /dev/dtrace/fasttrap entry. */ fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "dtrace/fasttrap"); mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF); mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT, NULL); #ifdef illumos fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT); #endif fasttrap_total = 0; /* * Conjure up the tracepoints hashtable... */ #ifdef illumos nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE); #else nent = tpoints_hash_size; #endif if (nent == 0 || nent > 0x1000000) nent = FASTTRAP_TPOINTS_DEFAULT_SIZE; tpoints_hash_size = nent; if (ISP2(nent)) fasttrap_tpoints.fth_nent = nent; else fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent); ASSERT(fasttrap_tpoints.fth_nent > 0); fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1; fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t), KM_SLEEP); #ifndef illumos for (i = 0; i < fasttrap_tpoints.fth_nent; i++) mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, "tracepoints bucket mtx", MUTEX_DEFAULT, NULL); #endif /* * ... and the providers hash table... */ nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE; if (ISP2(nent)) fasttrap_provs.fth_nent = nent; else fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent); ASSERT(fasttrap_provs.fth_nent > 0); fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1; fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t), KM_SLEEP); #ifndef illumos for (i = 0; i < fasttrap_provs.fth_nent; i++) mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx, "providers bucket mtx", MUTEX_DEFAULT, NULL); #endif ret = kproc_create(fasttrap_pid_cleanup_cb, NULL, &fasttrap_cleanup_proc, 0, 0, "ftcleanup"); if (ret != 0) { destroy_dev(fasttrap_cdev); #ifndef illumos for (i = 0; i < fasttrap_provs.fth_nent; i++) mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx); for (i = 0; i < fasttrap_tpoints.fth_nent; i++) mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx); #endif kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t)); mtx_destroy(&fasttrap_cleanup_mtx); mutex_destroy(&fasttrap_count_mtx); return (ret); } /* * ... and the procs hash table. */ nent = FASTTRAP_PROCS_DEFAULT_SIZE; if (ISP2(nent)) fasttrap_procs.fth_nent = nent; else fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent); ASSERT(fasttrap_procs.fth_nent > 0); fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1; fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t), KM_SLEEP); #ifndef illumos for (i = 0; i < fasttrap_procs.fth_nent; i++) mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx, "processes bucket mtx", MUTEX_DEFAULT, NULL); rm_init(&fasttrap_tp_lock, "fasttrap tracepoint"); /* * This event handler must run before kdtrace_thread_dtor() since it * accesses the thread's struct kdtrace_thread. */ fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST); #endif /* * Install our hooks into fork(2), exec(2), and exit(2). */ dtrace_fasttrap_fork = &fasttrap_fork; dtrace_fasttrap_exit = &fasttrap_exec_exit; dtrace_fasttrap_exec = &fasttrap_exec_exit; (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL, &fasttrap_meta_id); return (0); } static int fasttrap_unload(void) { int i, fail = 0; /* * Unregister the meta-provider to make sure no new fasttrap- * managed providers come along while we're trying to close up * shop. If we fail to detach, we'll need to re-register as a * meta-provider. We can fail to unregister as a meta-provider * if providers we manage still exist. */ if (fasttrap_meta_id != DTRACE_METAPROVNONE && dtrace_meta_unregister(fasttrap_meta_id) != 0) return (-1); /* * Iterate over all of our providers. If there's still a process * that corresponds to that pid, fail to detach. */ for (i = 0; i < fasttrap_provs.fth_nent; i++) { fasttrap_provider_t **fpp, *fp; fasttrap_bucket_t *bucket = &fasttrap_provs.fth_table[i]; mutex_enter(&bucket->ftb_mtx); fpp = (fasttrap_provider_t **)&bucket->ftb_data; while ((fp = *fpp) != NULL) { /* * Acquire and release the lock as a simple way of * waiting for any other consumer to finish with * this provider. A thread must first acquire the * bucket lock so there's no chance of another thread * blocking on the provider's lock. */ mutex_enter(&fp->ftp_mtx); mutex_exit(&fp->ftp_mtx); if (dtrace_unregister(fp->ftp_provid) != 0) { fail = 1; fpp = &fp->ftp_next; } else { *fpp = fp->ftp_next; fasttrap_provider_free(fp); } } mutex_exit(&bucket->ftb_mtx); } if (fail) { (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL, &fasttrap_meta_id); return (-1); } /* * Stop new processes from entering these hooks now, before the * fasttrap_cleanup thread runs. That way all processes will hopefully * be out of these hooks before we free fasttrap_provs.fth_table */ ASSERT(dtrace_fasttrap_fork == &fasttrap_fork); dtrace_fasttrap_fork = NULL; ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit); dtrace_fasttrap_exec = NULL; ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit); dtrace_fasttrap_exit = NULL; mtx_lock(&fasttrap_cleanup_mtx); fasttrap_cleanup_drain = 1; /* Wait for the cleanup thread to finish up and signal us. */ wakeup(&fasttrap_cleanup_cv); mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld", 0); fasttrap_cleanup_proc = NULL; mtx_destroy(&fasttrap_cleanup_mtx); #ifdef DEBUG mutex_enter(&fasttrap_count_mtx); ASSERT(fasttrap_pid_count == 0); mutex_exit(&fasttrap_count_mtx); #endif #ifndef illumos EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag); for (i = 0; i < fasttrap_tpoints.fth_nent; i++) mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx); for (i = 0; i < fasttrap_provs.fth_nent; i++) mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx); for (i = 0; i < fasttrap_procs.fth_nent; i++) mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx); #endif kmem_free(fasttrap_tpoints.fth_table, fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t)); fasttrap_tpoints.fth_nent = 0; kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t)); fasttrap_provs.fth_nent = 0; kmem_free(fasttrap_procs.fth_table, fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t)); fasttrap_procs.fth_nent = 0; #ifndef illumos destroy_dev(fasttrap_cdev); mutex_destroy(&fasttrap_count_mtx); rm_destroy(&fasttrap_tp_lock); #endif return (0); } /* ARGSUSED */ static int fasttrap_modevent(module_t mod __unused, int type, void *data __unused) { int error = 0; switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load, NULL); SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_unload, NULL); DEV_MODULE(fasttrap, fasttrap_modevent, NULL); MODULE_VERSION(fasttrap, 1); MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1); MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1); Index: projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris =================================================================== --- projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris (revision 326161) +++ projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris (revision 326162) Property changes on: projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/cddl/contrib/opensolaris:r326132-326161 Index: projects/bsd_rdma_4_9/sys/compat/linux/linux_emul.c =================================================================== --- projects/bsd_rdma_4_9/sys/compat/linux/linux_emul.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/compat/linux/linux_emul.c (revision 326162) @@ -1,300 +1,300 @@ /*- * Copyright (c) 2006 Roman Divacky * Copyright (c) 2013 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This returns reference to the thread emuldata entry (if found) * * Hold PROC_LOCK when referencing emuldata from other threads. */ struct linux_emuldata * em_find(struct thread *td) { struct linux_emuldata *em; em = td->td_emuldata; return (em); } /* * This returns reference to the proc pemuldata entry (if found) * * Hold PROC_LOCK when referencing proc pemuldata from other threads. * Hold LINUX_PEM_LOCK wher referencing pemuldata members. */ struct linux_pemuldata * pem_find(struct proc *p) { struct linux_pemuldata *pem; pem = p->p_emuldata; return (pem); } void linux_proc_init(struct thread *td, struct thread *newtd, int flags) { struct linux_emuldata *em; struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct proc *p; if (newtd != NULL) { p = newtd->td_proc; /* non-exec call */ em = malloc(sizeof(*em), M_TEMP, M_WAITOK | M_ZERO); if (flags & LINUX_CLONE_THREAD) { LINUX_CTR1(proc_init, "thread newtd(%d)", newtd->td_tid); em->em_tid = newtd->td_tid; } else { LINUX_CTR1(proc_init, "fork newtd(%d)", p->p_pid); em->em_tid = p->p_pid; pem = malloc(sizeof(*pem), M_LINUX, M_WAITOK | M_ZERO); sx_init(&pem->pem_sx, "lpemlk"); p->p_emuldata = pem; } newtd->td_emuldata = em; } else { p = td->td_proc; /* exec */ LINUX_CTR1(proc_init, "exec newtd(%d)", p->p_pid); /* lookup the old one */ em = em_find(td); KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n")); em->em_tid = p->p_pid; em->flags = 0; em->pdeath_signal = 0; em->robust_futexes = NULL; em->child_clear_tid = NULL; em->child_set_tid = NULL; /* epoll should be destroyed in a case of exec. */ pem = pem_find(p); KASSERT(pem != NULL, ("proc_exit: proc emuldata not found.\n")); pem->persona = 0; if (pem->epoll != NULL) { emd = pem->epoll; pem->epoll = NULL; free(emd, M_EPOLL); } } } void linux_proc_exit(void *arg __unused, struct proc *p) { struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct thread *td = curthread; if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX)) return; LINUX_CTR3(proc_exit, "thread(%d) proc(%d) p %p", td->td_tid, p->p_pid, p); pem = pem_find(p); if (pem == NULL) return; (p->p_sysent->sv_thread_detach)(td); p->p_emuldata = NULL; if (pem->epoll != NULL) { emd = pem->epoll; pem->epoll = NULL; free(emd, M_EPOLL); } sx_destroy(&pem->pem_sx); free(pem, M_LINUX); } int linux_common_execve(struct thread *td, struct image_args *eargs) { struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct vmspace *oldvmspace; struct linux_emuldata *em; struct proc *p; int error; p = td->td_proc; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = kern_execve(td, eargs, NULL); post_execve(td, error, oldvmspace); - if (error != 0) + if (error != EJUSTRETURN) return (error); /* * In a case of transition from Linux binary execing to * FreeBSD binary we destroy linux emuldata thread & proc entries. */ if (SV_CURPROC_ABI() != SV_ABI_LINUX) { PROC_LOCK(p); em = em_find(td); KASSERT(em != NULL, ("proc_exec: thread emuldata not found.\n")); td->td_emuldata = NULL; pem = pem_find(p); KASSERT(pem != NULL, ("proc_exec: proc pemuldata not found.\n")); p->p_emuldata = NULL; PROC_UNLOCK(p); if (pem->epoll != NULL) { emd = pem->epoll; pem->epoll = NULL; free(emd, M_EPOLL); } free(em, M_TEMP); free(pem, M_LINUX); } - return (0); + return (EJUSTRETURN); } void linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp) { struct thread *td = curthread; struct thread *othertd; #if defined(__amd64__) struct linux_pemuldata *pem; #endif /* * In a case of execing from linux binary properly detach * other threads from the user space. */ if (__predict_false(SV_PROC_ABI(p) == SV_ABI_LINUX)) { FOREACH_THREAD_IN_PROC(p, othertd) { if (td != othertd) (p->p_sysent->sv_thread_detach)(othertd); } } /* * In a case of execing to linux binary we create linux * emuldata thread entry. */ if (__predict_false((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX)) { if (SV_PROC_ABI(p) == SV_ABI_LINUX) linux_proc_init(td, NULL, 0); else linux_proc_init(td, td, 0); #if defined(__amd64__) /* * An IA32 executable which has executable stack will have the * READ_IMPLIES_EXEC personality flag set automatically. */ if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && imgp->stack_prot & VM_PROT_EXECUTE) { pem = pem_find(p); pem->persona |= LINUX_READ_IMPLIES_EXEC; } #endif } } void linux_thread_dtor(void *arg __unused, struct thread *td) { struct linux_emuldata *em; em = em_find(td); if (em == NULL) return; td->td_emuldata = NULL; LINUX_CTR1(thread_dtor, "thread(%d)", em->em_tid); free(em, M_TEMP); } void linux_schedtail(struct thread *td) { struct linux_emuldata *em; struct proc *p; int error = 0; int *child_set_tid; p = td->td_proc; em = em_find(td); KASSERT(em != NULL, ("linux_schedtail: thread emuldata not found.\n")); child_set_tid = em->child_set_tid; if (child_set_tid != NULL) { error = copyout(&em->em_tid, child_set_tid, sizeof(em->em_tid)); LINUX_CTR4(schedtail, "thread(%d) %p stored %d error %d", td->td_tid, child_set_tid, em->em_tid, error); } else LINUX_CTR1(schedtail, "thread(%d)", em->em_tid); } Index: projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.c =================================================================== --- projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.c (revision 326162) @@ -1,3437 +1,3437 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff"); -u_int g_mirror_debug = 0; -SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, +int g_mirror_debug = 0; +SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static g_ctl_destroy_geom_t g_mirror_destroy_geom; static g_taste_t g_mirror_taste; static g_init_t g_mirror_init; static g_fini_t g_mirror_fini; static g_provgone_t g_mirror_providergone; static g_resize_t g_mirror_resize; struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .providergone = g_mirror_providergone, .resize = g_mirror_resize }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, bool force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return ("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_get(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; sx_assert(&sc->sc_lock, SX_LOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. */ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. */ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_REMOVE(disk, d_next); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_free_device(struct g_mirror_softc *sc) { mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_destroy(&sc->sc_lock); free(sc, M_MIRROR); } static void g_mirror_providergone(struct g_provider *pp) { struct g_mirror_softc *sc = pp->private; if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_get(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); sx_xunlock(&sc->sc_lock); if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); g_topology_unlock(); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. */ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error); if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return (0); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_flush_done(struct bio *bp) { struct g_mirror_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->private; mtx_lock(&sc->sc_done_mtx); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_done_mtx); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_done_mtx); g_destroy_bio(bp); } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = bp->bio_parent; sc = pbp->bio_to->private; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } if (bp->bio_cmd == BIO_READ) KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read, bp->bio_error); else if (bp->bio_cmd == BIO_WRITE) KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write, bp->bio_error); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { if (bp->bio_error == ENXIO && bp->bio_cmd == BIO_READ) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; else if (bp->bio_error == ENXIO) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW; else sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int *val; sc = bp->bio_to->private; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = (int *)bp->bio_data; *val = (disk != NULL); g_io_deliver(bp, 0); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. * If the component with the highest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->private; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_flush_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_mirror_flush(sc, bp); return; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); if (bp->bio_to->error != 0) { mtx_unlock(&sc->sc_queue_mtx); g_io_deliver(bp, bp->bio_to->error); return; } bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; u_int i; if (sc->sc_sync.ds_ndisks == 0) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (1); } } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static int g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (0); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_mirror_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Free a synchronization request and clear its slot in the array. */ static void g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp) { int idx; if (disk != NULL && disk->d_sync.ds_bios != NULL) { idx = (int)(uintptr_t)bp->bio_caller1; KASSERT(disk->d_sync.ds_bios[idx] == bp, ("unexpected sync BIO at %p:%d", disk, idx)); disk->d_sync.ds_bios[idx] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider * being synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_mirror_sync_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); g_mirror_sync_request_free(NULL, bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { off_t offset; void *data; int i, idx; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; g_mirror_sync_request_free(disk, bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; idx = (int)(uintptr_t)bp->bio_caller1; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)idx; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp != NULL && bp->bio_offset < offset) offset = bp->bio_offset; } if (sync->ds_offset_done + (MAXPHYS * 100) < offset) { /* Update offset_done on every 100 blocks. */ sync->ds_offset_done = offset; g_mirror_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. */ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { sync = &disk->d_sync; switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= sync->ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (bioq_first(&queue) == NULL) { g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, bp); return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_mirror_event_get(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, true); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, false); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_mirror_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_takefirst(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); if (bioq_first(&sc->sc_queue) != NULL) { mtx_unlock(&sc->sc_queue_mtx); continue; } } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { g_mirror_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) g_mirror_regular_request(bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) g_mirror_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { g_mirror_register_request(bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[i] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)i; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = disk->d_sync.ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } g_mirror_regular_release(sc); free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_RECEIVE; pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } pp->private = sc; sc->sc_refcnt++; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) { /* * Abort any pending I/O that wasn't generated by us. * Synchronization requests and requests destined for individual * mirror components can be destroyed immediately. */ if (bp->bio_to == sc->sc_provider && bp->bio_from->geom != sc->sc_sync.ds_geom) { g_io_deliver(bp, ENXIO); } else { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } } mtx_unlock(&sc->sc_queue_mtx); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name); g_topology_unlock(); } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 && (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, bool force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; u_int dirty, ndisks, genid, syncid; bool broken; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * if we have any disks and 'force' is true. */ ndisks = g_mirror_ndisks(sc, -1); if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Find the biggest genid. */ genid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ broken = false; LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_genid < genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", g_mirror_get_diskname(disk), sc->sc_name); g_mirror_destroy_disk(disk); /* * Bump the syncid in case we discover a healthy * replacement disk after starting the mirror. */ broken = true; } } /* * Find the biggest syncid. */ syncid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid > syncid) syncid = disk->d_sync.ds_syncid; } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. */ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; sc->sc_syncid = syncid; if (force || broken) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No usable disks, so destroy the device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW; g_mirror_bump_syncid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) { G_MIRROR_DEBUG(0, "Device %s: failed to clear metadata on %s: %d.", sc->sc_name, g_mirror_get_diskname(disk), error); break; } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_slice != sc->sc_slice) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_slice", pp->name, sc->sc_name); return (EINVAL); } if (md->md_balance != sc->sc_balance) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_balance", pp->name, sc->sc_name); return (EINVAL); } #if 0 if (md->md_mediasize != sc->sc_mediasize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } #endif if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0, ("CLOSEWAIT flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } sc->sc_provider_open += acr + acw + ace; if (pp->acw + acw == 0) g_mirror_idle(sc, 0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 && sc->sc_provider_open == 0) g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. */ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_type = type; sc->sc_id = md->md_mid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sc->sc_refcnt = 1; sx_init(&sc->sc_lock, "gmirror:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; sc->sc_provider_open = 0; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); g_destroy_geom(sc->sc_geom); g_mirror_free_device(sc); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider_open != 0) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (%d).", sc->sc_name, sc->sc_provider_open); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", sc->sc_name); } } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { sx_xunlock(&sc->sc_lock); return (0); } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static void g_mirror_resize(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name); disk = cp->private; if (disk == NULL) return; g_topology_unlock(); g_mirror_update_metadata(disk); g_topology_lock(); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_provider->mediasize)); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_MIRROR_TYPE_AUTOMATIC: sbuf_printf(sb, "AUTOMATIC"); break; case G_MIRROR_TYPE_MANUAL: sbuf_printf(sb, "MANUAL"); break; default: sbuf_printf(sb, "UNKNOWN"); break; } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; if (panicstr != NULL) return; mp = arg; g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); Index: projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.h =================================================================== --- projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.h (revision 326161) +++ projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.h (revision 326162) @@ -1,510 +1,510 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_MIRROR_H_ #define _G_MIRROR_H_ #include #include #define G_MIRROR_CLASS_NAME "MIRROR" #define G_MIRROR_MAGIC "GEOM::MIRROR" /* * Version history: * 0 - Initial version number. * 1 - Added 'prefer' balance algorithm. * 2 - Added md_genid field to metadata. * 3 - Added md_provsize field to metadata. * 4 - Added 'no failure synchronization' flag. */ #define G_MIRROR_VERSION 4 #define G_MIRROR_BALANCE_NONE 0 #define G_MIRROR_BALANCE_ROUND_ROBIN 1 #define G_MIRROR_BALANCE_LOAD 2 #define G_MIRROR_BALANCE_SPLIT 3 #define G_MIRROR_BALANCE_PREFER 4 #define G_MIRROR_BALANCE_MIN G_MIRROR_BALANCE_NONE #define G_MIRROR_BALANCE_MAX G_MIRROR_BALANCE_PREFER #define G_MIRROR_DISK_FLAG_DIRTY 0x0000000000000001ULL #define G_MIRROR_DISK_FLAG_SYNCHRONIZING 0x0000000000000002ULL #define G_MIRROR_DISK_FLAG_FORCE_SYNC 0x0000000000000004ULL #define G_MIRROR_DISK_FLAG_INACTIVE 0x0000000000000008ULL #define G_MIRROR_DISK_FLAG_HARDCODED 0x0000000000000010ULL #define G_MIRROR_DISK_FLAG_BROKEN 0x0000000000000020ULL #define G_MIRROR_DISK_FLAG_CANDELETE 0x0000000000000040ULL #define G_MIRROR_DISK_FLAG_MASK (G_MIRROR_DISK_FLAG_DIRTY | \ G_MIRROR_DISK_FLAG_SYNCHRONIZING | \ G_MIRROR_DISK_FLAG_FORCE_SYNC | \ G_MIRROR_DISK_FLAG_INACTIVE | \ G_MIRROR_DISK_FLAG_CANDELETE) #define G_MIRROR_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_MIRROR_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL #define G_MIRROR_DEVICE_FLAG_MASK (G_MIRROR_DEVICE_FLAG_NOAUTOSYNC | \ G_MIRROR_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL -extern u_int g_mirror_debug; +extern int g_mirror_debug; #define G_MIRROR_DEBUG(lvl, ...) do { \ if (g_mirror_debug >= (lvl)) { \ printf("GEOM_MIRROR"); \ if (g_mirror_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_MIRROR_LOGREQ(lvl, bp, ...) do { \ if (g_mirror_debug >= (lvl)) { \ printf("GEOM_MIRROR"); \ if (g_mirror_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #define G_MIRROR_BIO_FLAG_REGULAR 0x01 #define G_MIRROR_BIO_FLAG_SYNC 0x02 /* * Informations needed for synchronization. */ struct g_mirror_disk_sync { struct g_consumer *ds_consumer; /* Consumer connected to our mirror. */ off_t ds_offset; /* Offset of next request to send. */ off_t ds_offset_done; /* Offset of already synchronized region. */ u_int ds_syncid; /* Disk's synchronization ID. */ u_int ds_inflight; /* Number of in-flight sync requests. */ struct bio **ds_bios; /* BIOs for synchronization I/O. */ }; /* * Informations needed for synchronization. */ struct g_mirror_device_sync { struct g_geom *ds_geom; /* Synchronization geom. */ u_int ds_ndisks; /* Number of disks in SYNCHRONIZING state. */ }; #define G_MIRROR_DISK_STATE_NONE 0 #define G_MIRROR_DISK_STATE_NEW 1 #define G_MIRROR_DISK_STATE_ACTIVE 2 #define G_MIRROR_DISK_STATE_STALE 3 #define G_MIRROR_DISK_STATE_SYNCHRONIZING 4 #define G_MIRROR_DISK_STATE_DISCONNECTED 5 #define G_MIRROR_DISK_STATE_DESTROY 6 struct g_mirror_disk { uint32_t d_id; /* Disk ID. */ struct g_consumer *d_consumer; /* Consumer. */ struct g_mirror_softc *d_softc; /* Back-pointer to softc. */ int d_state; /* Disk state. */ u_int d_priority; /* Disk priority. */ u_int load; /* Averaged queue length */ off_t d_last_offset; /* Last read offset */ uint64_t d_flags; /* Additional flags. */ u_int d_genid; /* Disk's generation ID. */ struct g_mirror_disk_sync d_sync;/* Sync information. */ LIST_ENTRY(g_mirror_disk) d_next; }; #define d_name d_consumer->provider->name #define G_MIRROR_EVENT_DONTWAIT 0x1 #define G_MIRROR_EVENT_WAIT 0x2 #define G_MIRROR_EVENT_DEVICE 0x4 #define G_MIRROR_EVENT_DONE 0x8 struct g_mirror_event { struct g_mirror_disk *e_disk; int e_state; int e_flags; int e_error; TAILQ_ENTRY(g_mirror_event) e_next; }; #define G_MIRROR_DEVICE_FLAG_DESTROY 0x0100000000000000ULL #define G_MIRROR_DEVICE_FLAG_DRAIN 0x0200000000000000ULL #define G_MIRROR_DEVICE_FLAG_CLOSEWAIT 0x0400000000000000ULL #define G_MIRROR_DEVICE_FLAG_TASTING 0x0800000000000000ULL #define G_MIRROR_DEVICE_FLAG_WIPE 0x1000000000000000ULL #define G_MIRROR_DEVICE_STATE_STARTING 0 #define G_MIRROR_DEVICE_STATE_RUNNING 1 #define G_MIRROR_TYPE_MANUAL 0 #define G_MIRROR_TYPE_AUTOMATIC 1 /* Bump syncid on first write. */ #define G_MIRROR_BUMP_SYNCID 0x1 /* Bump genid immediately. */ #define G_MIRROR_BUMP_GENID 0x2 /* Bump syncid immediately. */ #define G_MIRROR_BUMP_SYNCID_NOW 0x4 struct g_mirror_softc { u_int sc_type; /* Device type (manual/automatic). */ u_int sc_state; /* Device state. */ uint32_t sc_slice; /* Slice size. */ uint8_t sc_balance; /* Balance algorithm. */ uint64_t sc_mediasize; /* Device size. */ uint32_t sc_sectorsize; /* Sector size. */ uint64_t sc_flags; /* Additional flags. */ struct g_geom *sc_geom; struct g_provider *sc_provider; int sc_provider_open; uint32_t sc_id; /* Mirror unique ID. */ struct sx sc_lock; struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; struct proc *sc_worker; struct bio_queue_head sc_regular_delayed; /* Delayed I/O requests due collision with sync requests. */ struct bio_queue_head sc_inflight; /* In-flight regular write requests. */ struct bio_queue_head sc_sync_delayed; /* Delayed sync requests due collision with regular requests. */ LIST_HEAD(, g_mirror_disk) sc_disks; u_int sc_ndisks; /* Number of disks. */ struct g_mirror_disk *sc_hint; u_int sc_genid; /* Generation ID. */ u_int sc_syncid; /* Synchronization ID. */ int sc_bump_id; struct g_mirror_device_sync sc_sync; int sc_idle; /* DIRTY flags removed. */ time_t sc_last_write; u_int sc_writes; u_int sc_refcnt; /* Number of softc references */ TAILQ_HEAD(, g_mirror_event) sc_events; struct mtx sc_events_mtx; struct callout sc_callout; struct root_hold_token *sc_rootmount; struct mtx sc_done_mtx; }; #define sc_name sc_geom->name struct g_mirror_metadata; u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state); struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type); #define G_MIRROR_DESTROY_SOFT 0 #define G_MIRROR_DESTROY_DELAYED 1 #define G_MIRROR_DESTROY_HARD 2 int g_mirror_destroy(struct g_mirror_softc *sc, int how); int g_mirror_event_send(void *arg, int state, int flags); struct g_mirror_metadata; int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md); int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md); void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md); void g_mirror_update_metadata(struct g_mirror_disk *disk); g_ctl_req_t g_mirror_config; #endif /* _KERNEL */ struct g_mirror_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Mirror name. */ uint32_t md_mid; /* Mirror unique ID. */ uint32_t md_did; /* Disk unique ID. */ uint8_t md_all; /* Number of disks in mirror. */ uint32_t md_genid; /* Generation ID. */ uint32_t md_syncid; /* Synchronization ID. */ uint8_t md_priority; /* Disk priority. */ uint32_t md_slice; /* Slice size. */ uint8_t md_balance; /* Balance type. */ uint64_t md_mediasize; /* Size of the smallest disk in mirror. */ uint32_t md_sectorsize; /* Sector size. */ uint64_t md_sync_offset; /* Synchronized offset. */ uint64_t md_mflags; /* Additional mirror flags. */ uint64_t md_dflags; /* Additional disk flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. */ }; static __inline void mirror_metadata_encode(struct g_mirror_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, 16); le32enc(data + 36, md->md_mid); le32enc(data + 40, md->md_did); *(data + 44) = md->md_all; le32enc(data + 45, md->md_genid); le32enc(data + 49, md->md_syncid); *(data + 53) = md->md_priority; le32enc(data + 54, md->md_slice); *(data + 58) = md->md_balance; le64enc(data + 59, md->md_mediasize); le32enc(data + 67, md->md_sectorsize); le64enc(data + 71, md->md_sync_offset); le64enc(data + 79, md->md_mflags); le64enc(data + 87, md->md_dflags); bcopy(md->md_provider, data + 95, 16); le64enc(data + 111, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 119); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 119, 16); } static __inline int mirror_metadata_decode_v0v1(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_syncid = le32dec(data + 45); md->md_priority = *(data + 49); md->md_slice = le32dec(data + 50); md->md_balance = *(data + 54); md->md_mediasize = le64dec(data + 55); md->md_sectorsize = le32dec(data + 63); md->md_sync_offset = le64dec(data + 67); md->md_mflags = le64dec(data + 75); md->md_dflags = le64dec(data + 83); bcopy(data + 91, md->md_provider, 16); bcopy(data + 107, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 107); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 107, 16) != 0) return (EINVAL); /* New fields. */ md->md_genid = 0; md->md_provsize = 0; return (0); } static __inline int mirror_metadata_decode_v2(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_genid = le32dec(data + 45); md->md_syncid = le32dec(data + 49); md->md_priority = *(data + 53); md->md_slice = le32dec(data + 54); md->md_balance = *(data + 58); md->md_mediasize = le64dec(data + 59); md->md_sectorsize = le32dec(data + 67); md->md_sync_offset = le64dec(data + 71); md->md_mflags = le64dec(data + 79); md->md_dflags = le64dec(data + 87); bcopy(data + 95, md->md_provider, 16); bcopy(data + 111, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 111); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 111, 16) != 0) return (EINVAL); /* New fields. */ md->md_provsize = 0; return (0); } static __inline int mirror_metadata_decode_v3v4(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_genid = le32dec(data + 45); md->md_syncid = le32dec(data + 49); md->md_priority = *(data + 53); md->md_slice = le32dec(data + 54); md->md_balance = *(data + 58); md->md_mediasize = le64dec(data + 59); md->md_sectorsize = le32dec(data + 67); md->md_sync_offset = le64dec(data + 71); md->md_mflags = le64dec(data + 79); md->md_dflags = le64dec(data + 87); bcopy(data + 95, md->md_provider, 16); md->md_provsize = le64dec(data + 111); bcopy(data + 119, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 119); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 119, 16) != 0) return (EINVAL); return (0); } static __inline int mirror_metadata_decode(const u_char *data, struct g_mirror_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: case 1: error = mirror_metadata_decode_v0v1(data, md); break; case 2: error = mirror_metadata_decode_v2(data, md); break; case 3: case 4: error = mirror_metadata_decode_v3v4(data, md); break; default: error = EINVAL; break; } return (error); } static __inline const char * balance_name(u_int balance) { static const char *algorithms[] = { [G_MIRROR_BALANCE_NONE] = "none", [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", [G_MIRROR_BALANCE_LOAD] = "load", [G_MIRROR_BALANCE_SPLIT] = "split", [G_MIRROR_BALANCE_PREFER] = "prefer", [G_MIRROR_BALANCE_MAX + 1] = "unknown" }; if (balance > G_MIRROR_BALANCE_MAX) balance = G_MIRROR_BALANCE_MAX + 1; return (algorithms[balance]); } static __inline int balance_id(const char *name) { static const char *algorithms[] = { [G_MIRROR_BALANCE_NONE] = "none", [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", [G_MIRROR_BALANCE_LOAD] = "load", [G_MIRROR_BALANCE_SPLIT] = "split", [G_MIRROR_BALANCE_PREFER] = "prefer" }; int n; for (n = G_MIRROR_BALANCE_MIN; n <= G_MIRROR_BALANCE_MAX; n++) { if (strcmp(name, algorithms[n]) == 0) return (n); } return (-1); } static __inline void mirror_metadata_dump(const struct g_mirror_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" name: %s\n", md->md_name); printf(" mid: %u\n", (u_int)md->md_mid); printf(" did: %u\n", (u_int)md->md_did); printf(" all: %u\n", (u_int)md->md_all); printf(" genid: %u\n", (u_int)md->md_genid); printf(" syncid: %u\n", (u_int)md->md_syncid); printf(" priority: %u\n", (u_int)md->md_priority); printf(" slice: %u\n", (u_int)md->md_slice); printf(" balance: %s\n", balance_name((u_int)md->md_balance)); printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); printf(" mflags:"); if (md->md_mflags == 0) printf(" NONE"); else { if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) printf(" NOFAILSYNC"); if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) printf(" NOAUTOSYNC"); } printf("\n"); printf(" dflags:"); if (md->md_dflags == 0) printf(" NONE"); else { if ((md->md_dflags & G_MIRROR_DISK_FLAG_DIRTY) != 0) printf(" DIRTY"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) printf(" SYNCHRONIZING"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) printf(" FORCE_SYNC"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) printf(" INACTIVE"); } printf("\n"); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_MIRROR_H_ */ Index: projects/bsd_rdma_4_9/sys/i386/i386/machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/i386/i386/machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/i386/i386/machdep.c (revision 326162) @@ -1,3045 +1,3039 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern register_t init386(int first); extern void dblfault_handler(void); static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel; u_int basemem; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #ifdef PAE FEATURE(pae, "Physical Address Extensions"); #endif /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; /* Default init_ops implementation. */ struct init_ops init_ops = { .early_clock_source_init = i8254_init, .early_delay = i8254_delay, #ifdef DEV_APIC .msi_init = msi_init, #endif }; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_cnt.v_free_count), ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; if (p->p_sysent->sv_sigcode_base != 0) { regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szosigcode; } else { /* a.out sysentvec does not use shared page */ regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; } regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szfreebsd4_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; struct segment_descriptor *sdp; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. */ sdp = &td->td_pcb->pcb_fsd; sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned int)sp & ~0x3F); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base; if (regs->tf_eip == 0) regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct trapframe *regs; struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) { uprintf( "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); /* * Reset the fs and gs bases. The values from the old address * space do not make sense for the new program. In particular, * gsbase might be the TLS base for the old program but the new * program has no TLS now. */ set_fsbase(td, 0); set_gsbase(td, 0); + /* Make sure edx is 0x0 on entry. Linux binaries depend on it. */ bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = imgp->entry_addr; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = imgp->ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); - - /* - * XXX - Linux emulator - * Make sure sure edx is 0x0 on entry. Linux binaries depend - * on it. - */ - td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); static char bootmethod[16] = "BIOS"; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ struct mtx dt_lock; /* lock for GDT and LDT */ static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ { .ssd_base = (int) ldt, .ssd_limit = sizeof(ldt)-1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { .ssd_base = (int) ldt, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { .ssd_base = (int) &dblfault_tss, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016llx\n", rxcr(0)); if (amd_feature & (AMDID_NX | AMDID_LM)) db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t0x%016llx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); if ((cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); if (cpu_feature & CPUID_PAT) db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%08x\n", rdr0()); db_printf("dr1\t0x%08x\n", rdr1()); db_printf("dr2\t0x%08x\n", rdr2()); db_printf("dr3\t0x%08x\n", rdr3()); db_printf("dr6\t0x%08x\n", rdr6()); db_printf("dr7\t0x%08x\n", rdr7()); } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); #ifndef PAE if (base > 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(length / 1024)); return (1); } #endif /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } static int add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) return (1); return (add_physmap_entry(smap->base, smap->length, physmap, physmap_idxp)); } static void add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, int *physmap_idxp) { struct bios_smap *smap, *smapend; u_int32_t smapsize; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes SMAP. */ smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } static void basemem_setup(void) { vm_paddr_t pa; pt_entry_t *pte; int i; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int has_smap, off, physmap_idx, pa_indx, da_indx; u_long memtest; vm_paddr_t physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size, physmem_tunable; int hasbrokenint12, i, res; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa; struct bios_smap *smap, *smapbase; caddr_t kmdp; has_smap = 0; bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Check if the loader supplied an SMAP memory map. If so, * use that and do not make any VM86 calls. */ physmap_idx = 0; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { add_smap_entries(smapbase, physmap, &physmap_idx); has_smap = 1; goto have_smap; } /* * Some newer BIOSes have a broken INT 12H implementation * which causes a kernel panic immediately. In this case, we * need use the SMAP to determine the base memory size. */ hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); if (hasbrokenint12 == 0) { /* Use INT12 to determine base memory size. */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; basemem_setup(); } /* * Fetch the memory map with INT 15:E820. Map page 1 R/W into * the kernel page table so we can use it as a buffer. The * kernel will unmap this page later. */ pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); KASSERT(res != 0, ("vm86_getptr() failed: address not found")); vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; has_smap = 1; if (!add_smap_entry(smap, physmap, &physmap_idx)) break; } while (vmf.vmf_ebx != 0); have_smap: /* * If we didn't fetch the "base memory" size from INT12, * figure it out from the SMAP (or just guess). */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* XXX: If we couldn't find basemem from SMAP, just guess. */ if (basemem == 0) basemem = 640; basemem_setup(); } if (physmap[1] != 0) goto physmap_done; /* * If we failed to find an SMAP, figure out the extended * memory size. We will then build a simple memory map with * two segments, one for "base memory" and the second for * "extended memory". Note that "extended memory" starts at a * physical address of 1MB and that both basemem and extmem * are in units of 1KB. * * First, try to fetch the extended memory size via INT 15:E801. */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { /* * If INT15:E801 fails, this is our last ditch effort * to determine the extended memory size. Currently * we prefer the RTC value over INT15:88. */ #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. * * This is especially confusing when it is much larger than the * memory size and is displayed as "realmem". */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); /* * By default enable the memory test on real hardware, and disable * it if we appear to be running in a VM. This avoids touching all * pages unnecessarily, which doesn't matter on real hardware but is * bad for shared VM hosts. Use a general name so that * one could eventually do more with the code than just disable it. */ memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP3; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR3; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } static void i386_kdb_init(void) { #ifdef DDB db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } register_t init386(int first) { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x, pa; struct pcpu *pc; struct xstate_hdr *xhdr; int late_console; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = TD0_KSTACK_PAGES; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (bootinfo.bi_envp != 0) init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); else init_static_kenv(NULL, 0); identify_hypervisor(); /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) pmap_kenter(pa + KERNBASE, pa); dpcpu_init((void *)(first + KERNBASE), 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < nitems(ldt_segs); x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #if defined(PAE) || defined(PAE_TABLES) dblfault_tss.tss_cr3 = (int)IdlePDPT; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); /* Initialize the tss (except for the final esp0) early for vm86. */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* Initialize the PIC early for vm86 calls. */ #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #endif /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); i386_kdb_init(); } vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ if (late_console) cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); if (late_console) i386_kdb_init(); msgbufinit(msgbufp, msgbufsize); npxinit(true); /* * Set up thread0 pcb after npxinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } PCPU_SET(curpcb, thread0.td_pcb); /* Move esp0 in the tss to its final place. */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #if defined(PAE) || defined(PAE_TABLES) thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; critical_exit(); flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO); if (tmp == 0) panic("kmem_malloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; pcb->pcb_gs = rgs(); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_gs = pcb->pcb_gs; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); npxgetregs(td); if (cpu_fxsr) npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, (struct save87 *)fpregs); else bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, sizeof(*fpregs)); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { if (cpu_fxsr) npx_set_fpregs_xmm((struct save87 *)fpregs, &get_pcb_user_save_td(td)->sv_xmm); else bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, sizeof(*fpregs)); npxuserinited(td); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; struct segment_descriptor *sdp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); sdp = &td->td_pcb->pcb_fsd; mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; mcp->mc_flags = 0; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tp; char *xfpustate; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = npxgetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = npxformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(union savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } static void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) npxdrop(); /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: projects/bsd_rdma_4_9/sys/i386/include/atomic.h =================================================================== --- projects/bsd_rdma_4_9/sys/i386/include/atomic.h (revision 326161) +++ projects/bsd_rdma_4_9/sys/i386/include/atomic.h (revision 326162) @@ -1,854 +1,866 @@ /*- * Copyright (c) 1998 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_ATOMIC_H_ #define _MACHINE_ATOMIC_H_ #ifndef _SYS_CDEFS_H_ #error this file needs sys/cdefs.h as a prerequisite #endif #ifdef _KERNEL #include #include #endif #ifndef __OFFSETOF_MONITORBUF /* * __OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf). * * The open-coded number is used instead of the symbolic expression to * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers. * An assertion in i386/vm_machdep.c ensures that the value is correct. */ #define __OFFSETOF_MONITORBUF 0x80 static __inline void __mbk(void) { __asm __volatile("lock; addl $0,%%fs:%0" : "+m" (*(u_int *)__OFFSETOF_MONITORBUF) : : "memory", "cc"); } static __inline void __mbu(void) { __asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc"); } #endif /* * Various simple operations on memory, each of which is atomic in the * presence of interrupts and multiple processors. * * atomic_set_char(P, V) (*(u_char *)(P) |= (V)) * atomic_clear_char(P, V) (*(u_char *)(P) &= ~(V)) * atomic_add_char(P, V) (*(u_char *)(P) += (V)) * atomic_subtract_char(P, V) (*(u_char *)(P) -= (V)) * * atomic_set_short(P, V) (*(u_short *)(P) |= (V)) * atomic_clear_short(P, V) (*(u_short *)(P) &= ~(V)) * atomic_add_short(P, V) (*(u_short *)(P) += (V)) * atomic_subtract_short(P, V) (*(u_short *)(P) -= (V)) * * atomic_set_int(P, V) (*(u_int *)(P) |= (V)) * atomic_clear_int(P, V) (*(u_int *)(P) &= ~(V)) * atomic_add_int(P, V) (*(u_int *)(P) += (V)) * atomic_subtract_int(P, V) (*(u_int *)(P) -= (V)) * atomic_swap_int(P, V) (return (*(u_int *)(P)); *(u_int *)(P) = (V);) * atomic_readandclear_int(P) (return (*(u_int *)(P)); *(u_int *)(P) = 0;) * * atomic_set_long(P, V) (*(u_long *)(P) |= (V)) * atomic_clear_long(P, V) (*(u_long *)(P) &= ~(V)) * atomic_add_long(P, V) (*(u_long *)(P) += (V)) * atomic_subtract_long(P, V) (*(u_long *)(P) -= (V)) * atomic_swap_long(P, V) (return (*(u_long *)(P)); *(u_long *)(P) = (V);) * atomic_readandclear_long(P) (return (*(u_long *)(P)); *(u_long *)(P) = 0;) */ /* * The above functions are expanded inline in the statically-linked * kernel. Lock prefixes are generated if an SMP kernel is being * built. * * Kernel modules call real functions which are built into the kernel. * This allows kernel modules to be portable between UP and SMP systems. */ #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM) #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \ void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_char(volatile u_char *dst, u_char expect, u_char src); int atomic_cmpset_short(volatile u_short *dst, u_short expect, u_short src); int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src); int atomic_fcmpset_char(volatile u_char *dst, u_char *expect, u_char src); int atomic_fcmpset_short(volatile u_short *dst, u_short *expect, u_short src); int atomic_fcmpset_int(volatile u_int *dst, u_int *expect, u_int src); u_int atomic_fetchadd_int(volatile u_int *p, u_int v); int atomic_testandset_int(volatile u_int *p, u_int v); int atomic_testandclear_int(volatile u_int *p, u_int v); void atomic_thread_fence_acq(void); void atomic_thread_fence_acq_rel(void); void atomic_thread_fence_rel(void); void atomic_thread_fence_seq_cst(void); #define ATOMIC_LOAD(TYPE) \ u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p) #define ATOMIC_STORE(TYPE) \ void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_64(volatile uint64_t *, uint64_t, uint64_t); uint64_t atomic_load_acq_64(volatile uint64_t *); void atomic_store_rel_64(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); +uint64_t atomic_fetchadd_64(volatile uint64_t *, uint64_t); #else /* !KLD_MODULE && __GNUCLIKE_ASM */ /* * For userland, always use lock prefixes so that the binaries will run * on both SMP and !SMP systems. */ #if defined(SMP) || !defined(_KERNEL) #define MPLOCKED "lock ; " #else #define MPLOCKED #endif /* * The assembly is volatilized to avoid code chunk removal by the compiler. * GCC aggressively reorders operations and memory clobbering is necessary * in order to avoid that for memory barriers. */ #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ static __inline void \ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "cc"); \ } \ \ static __inline void \ atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "memory", "cc"); \ } \ struct __hack /* * Atomic compare and set, used by the mutex functions. * * cmpset: * if (*dst == expect) * *dst = src * * fcmpset: * if (*dst == *expect) * *dst = src * else * *expect = *dst * * Returns 0 on failure, non-zero on success. */ #define ATOMIC_CMPSET(TYPE, CONS) \ static __inline int \ atomic_cmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE expect, u_##TYPE src) \ { \ u_char res; \ \ __asm __volatile( \ " " MPLOCKED " " \ " cmpxchg %3,%1 ; " \ " sete %0 ; " \ "# atomic_cmpset_" #TYPE " " \ : "=q" (res), /* 0 */ \ "+m" (*dst), /* 1 */ \ "+a" (expect) /* 2 */ \ : CONS (src) /* 3 */ \ : "memory", "cc"); \ return (res); \ } \ \ static __inline int \ atomic_fcmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE *expect, u_##TYPE src) \ { \ u_char res; \ \ __asm __volatile( \ " " MPLOCKED " " \ " cmpxchg %3,%1 ; " \ " sete %0 ; " \ "# atomic_fcmpset_" #TYPE " " \ : "=q" (res), /* 0 */ \ "+m" (*dst), /* 1 */ \ "+a" (*expect) /* 2 */ \ : CONS (src) /* 3 */ \ : "memory", "cc"); \ return (res); \ } ATOMIC_CMPSET(char, "q"); ATOMIC_CMPSET(short, "r"); ATOMIC_CMPSET(int, "r"); /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. */ static __inline u_int atomic_fetchadd_int(volatile u_int *p, u_int v) { __asm __volatile( " " MPLOCKED " " " xaddl %0,%1 ; " "# atomic_fetchadd_int" : "+r" (v), /* 0 */ "+m" (*p) /* 1 */ : : "cc"); return (v); } static __inline int atomic_testandset_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btsl %2,%1 ; " " setc %0 ; " "# atomic_testandset_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } static __inline int atomic_testandclear_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btrl %2,%1 ; " " setc %0 ; " "# atomic_testandclear_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } /* * We assume that a = b will do atomic loads and stores. Due to the * IA32 memory model, a simple store guarantees release semantics. * * However, a load may pass a store if they are performed on distinct * addresses, so we need Store/Load barrier for sequentially * consistent fences in SMP kernels. We use "lock addl $0,mem" for a * Store/Load barrier, as recommended by the AMD Software Optimization * Guide, and not mfence. In the kernel, we use a private per-cpu * cache line for "mem", to avoid introducing false data * dependencies. In user space, we use the word at the top of the * stack. * * For UP kernels, however, the memory of the single processor is * always consistent, so we only need to stop the compiler from * reordering accesses in a way that violates the semantics of acquire * and release. */ #if defined(_KERNEL) #if defined(SMP) #define __storeload_barrier() __mbk() #else /* _KERNEL && UP */ #define __storeload_barrier() __compiler_membar() #endif /* SMP */ #else /* !_KERNEL */ #define __storeload_barrier() __mbu() #endif /* _KERNEL*/ #define ATOMIC_LOAD(TYPE) \ static __inline u_##TYPE \ atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ { \ u_##TYPE res; \ \ res = *p; \ __compiler_membar(); \ return (res); \ } \ struct __hack #define ATOMIC_STORE(TYPE) \ static __inline void \ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \ { \ \ __compiler_membar(); \ *p = v; \ } \ struct __hack static __inline void atomic_thread_fence_acq(void) { __compiler_membar(); } static __inline void atomic_thread_fence_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_acq_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_seq_cst(void) { __storeload_barrier(); } #ifdef _KERNEL #ifdef WANT_FUNCTIONS int atomic_cmpset_64_i386(volatile uint64_t *, uint64_t, uint64_t); int atomic_cmpset_64_i586(volatile uint64_t *, uint64_t, uint64_t); uint64_t atomic_load_acq_64_i386(volatile uint64_t *); uint64_t atomic_load_acq_64_i586(volatile uint64_t *); void atomic_store_rel_64_i386(volatile uint64_t *, uint64_t); void atomic_store_rel_64_i586(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64_i386(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64_i586(volatile uint64_t *, uint64_t); #endif /* I486 does not support SMP or CMPXCHG8B. */ static __inline int atomic_cmpset_64_i386(volatile uint64_t *dst, uint64_t expect, uint64_t src) { volatile uint32_t *p; u_char res; p = (volatile uint32_t *)dst; __asm __volatile( " pushfl ; " " cli ; " " xorl %1,%%eax ; " " xorl %2,%%edx ; " " orl %%edx,%%eax ; " " jne 1f ; " " movl %4,%1 ; " " movl %5,%2 ; " "1: " " sete %3 ; " " popfl" : "+A" (expect), /* 0 */ "+m" (*p), /* 1 */ "+m" (*(p + 1)), /* 2 */ "=q" (res) /* 3 */ : "r" ((uint32_t)src), /* 4 */ "r" ((uint32_t)(src >> 32)) /* 5 */ : "memory", "cc"); return (res); } static __inline uint64_t atomic_load_acq_64_i386(volatile uint64_t *p) { volatile uint32_t *q; uint64_t res; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %1,%%eax ; " " movl %2,%%edx ; " " popfl" : "=&A" (res) /* 0 */ : "m" (*q), /* 1 */ "m" (*(q + 1)) /* 2 */ : "memory"); return (res); } static __inline void atomic_store_rel_64_i386(volatile uint64_t *p, uint64_t v) { volatile uint32_t *q; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %%eax,%0 ; " " movl %%edx,%1 ; " " popfl" : "=m" (*q), /* 0 */ "=m" (*(q + 1)) /* 1 */ : "A" (v) /* 2 */ : "memory"); } static __inline uint64_t atomic_swap_64_i386(volatile uint64_t *p, uint64_t v) { volatile uint32_t *q; uint64_t res; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %1,%%eax ; " " movl %2,%%edx ; " " movl %4,%2 ; " " movl %3,%1 ; " " popfl" : "=&A" (res), /* 0 */ "+m" (*q), /* 1 */ "+m" (*(q + 1)) /* 2 */ : "r" ((uint32_t)v), /* 3 */ "r" ((uint32_t)(v >> 32))); /* 4 */ return (res); } static __inline int atomic_cmpset_64_i586(volatile uint64_t *dst, uint64_t expect, uint64_t src) { u_char res; __asm __volatile( " " MPLOCKED " " " cmpxchg8b %1 ; " " sete %0" : "=q" (res), /* 0 */ "+m" (*dst), /* 1 */ "+A" (expect) /* 2 */ : "b" ((uint32_t)src), /* 3 */ "c" ((uint32_t)(src >> 32)) /* 4 */ : "memory", "cc"); return (res); } static __inline uint64_t atomic_load_acq_64_i586(volatile uint64_t *p) { uint64_t res; __asm __volatile( " movl %%ebx,%%eax ; " " movl %%ecx,%%edx ; " " " MPLOCKED " " " cmpxchg8b %1" : "=&A" (res), /* 0 */ "+m" (*p) /* 1 */ : : "memory", "cc"); return (res); } static __inline void atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v) { __asm __volatile( " movl %%eax,%%ebx ; " " movl %%edx,%%ecx ; " "1: " " " MPLOCKED " " " cmpxchg8b %0 ; " " jne 1b" : "+m" (*p), /* 0 */ "+A" (v) /* 1 */ : : "ebx", "ecx", "memory", "cc"); } static __inline uint64_t atomic_swap_64_i586(volatile uint64_t *p, uint64_t v) { __asm __volatile( " movl %%eax,%%ebx ; " " movl %%edx,%%ecx ; " "1: " " " MPLOCKED " " " cmpxchg8b %0 ; " " jne 1b" : "+m" (*p), /* 0 */ "+A" (v) /* 1 */ : : "ebx", "ecx", "memory", "cc"); return (v); } static __inline int atomic_cmpset_64(volatile uint64_t *dst, uint64_t expect, uint64_t src) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_cmpset_64_i386(dst, expect, src)); else return (atomic_cmpset_64_i586(dst, expect, src)); } static __inline uint64_t atomic_load_acq_64(volatile uint64_t *p) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_load_acq_64_i386(p)); else return (atomic_load_acq_64_i586(p)); } static __inline void atomic_store_rel_64(volatile uint64_t *p, uint64_t v) { if ((cpu_feature & CPUID_CX8) == 0) atomic_store_rel_64_i386(p, v); else atomic_store_rel_64_i586(p, v); } static __inline uint64_t atomic_swap_64(volatile uint64_t *p, uint64_t v) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_swap_64_i386(p, v)); else return (atomic_swap_64_i586(p, v)); +} + +static __inline uint64_t +atomic_fetchadd_64(volatile uint64_t *p, uint64_t v) +{ + + for (;;) { + uint64_t t = *p; + if (atomic_cmpset_64(p, t, t + v)) + return (t); + } } #endif /* _KERNEL */ #endif /* KLD_MODULE || !__GNUCLIKE_ASM */ ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v); ATOMIC_ASM(clear, char, "andb %b1,%0", "iq", ~v); ATOMIC_ASM(add, char, "addb %b1,%0", "iq", v); ATOMIC_ASM(subtract, char, "subb %b1,%0", "iq", v); ATOMIC_ASM(set, short, "orw %w1,%0", "ir", v); ATOMIC_ASM(clear, short, "andw %w1,%0", "ir", ~v); ATOMIC_ASM(add, short, "addw %w1,%0", "ir", v); ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir", v); ATOMIC_ASM(set, int, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, int, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, int, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, int, "subl %1,%0", "ir", v); ATOMIC_ASM(set, long, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, long, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, long, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, long, "subl %1,%0", "ir", v); #define ATOMIC_LOADSTORE(TYPE) \ ATOMIC_LOAD(TYPE); \ ATOMIC_STORE(TYPE) ATOMIC_LOADSTORE(char); ATOMIC_LOADSTORE(short); ATOMIC_LOADSTORE(int); ATOMIC_LOADSTORE(long); #undef ATOMIC_ASM #undef ATOMIC_LOAD #undef ATOMIC_STORE #undef ATOMIC_LOADSTORE #ifndef WANT_FUNCTIONS static __inline int atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) { return (atomic_cmpset_int((volatile u_int *)dst, (u_int)expect, (u_int)src)); } static __inline u_long atomic_fetchadd_long(volatile u_long *p, u_long v) { return (atomic_fetchadd_int((volatile u_int *)p, (u_int)v)); } static __inline int atomic_testandset_long(volatile u_long *p, u_int v) { return (atomic_testandset_int((volatile u_int *)p, v)); } static __inline int atomic_testandclear_long(volatile u_long *p, u_int v) { return (atomic_testandclear_int((volatile u_int *)p, v)); } /* Read the current value and store a new value in the destination. */ #ifdef __GNUCLIKE_ASM static __inline u_int atomic_swap_int(volatile u_int *p, u_int v) { __asm __volatile( " xchgl %1,%0 ; " "# atomic_swap_int" : "+r" (v), /* 0 */ "+m" (*p)); /* 1 */ return (v); } static __inline u_long atomic_swap_long(volatile u_long *p, u_long v) { return (atomic_swap_int((volatile u_int *)p, (u_int)v)); } #else /* !__GNUCLIKE_ASM */ u_int atomic_swap_int(volatile u_int *p, u_int v); u_long atomic_swap_long(volatile u_long *p, u_long v); #endif /* __GNUCLIKE_ASM */ #define atomic_set_acq_char atomic_set_barr_char #define atomic_set_rel_char atomic_set_barr_char #define atomic_clear_acq_char atomic_clear_barr_char #define atomic_clear_rel_char atomic_clear_barr_char #define atomic_add_acq_char atomic_add_barr_char #define atomic_add_rel_char atomic_add_barr_char #define atomic_subtract_acq_char atomic_subtract_barr_char #define atomic_subtract_rel_char atomic_subtract_barr_char #define atomic_cmpset_acq_char atomic_cmpset_char #define atomic_cmpset_rel_char atomic_cmpset_char #define atomic_fcmpset_acq_char atomic_fcmpset_char #define atomic_fcmpset_rel_char atomic_fcmpset_char #define atomic_set_acq_short atomic_set_barr_short #define atomic_set_rel_short atomic_set_barr_short #define atomic_clear_acq_short atomic_clear_barr_short #define atomic_clear_rel_short atomic_clear_barr_short #define atomic_add_acq_short atomic_add_barr_short #define atomic_add_rel_short atomic_add_barr_short #define atomic_subtract_acq_short atomic_subtract_barr_short #define atomic_subtract_rel_short atomic_subtract_barr_short #define atomic_cmpset_acq_short atomic_cmpset_short #define atomic_cmpset_rel_short atomic_cmpset_short #define atomic_fcmpset_acq_short atomic_fcmpset_short #define atomic_fcmpset_rel_short atomic_fcmpset_short #define atomic_set_acq_int atomic_set_barr_int #define atomic_set_rel_int atomic_set_barr_int #define atomic_clear_acq_int atomic_clear_barr_int #define atomic_clear_rel_int atomic_clear_barr_int #define atomic_add_acq_int atomic_add_barr_int #define atomic_add_rel_int atomic_add_barr_int #define atomic_subtract_acq_int atomic_subtract_barr_int #define atomic_subtract_rel_int atomic_subtract_barr_int #define atomic_cmpset_acq_int atomic_cmpset_int #define atomic_cmpset_rel_int atomic_cmpset_int #define atomic_fcmpset_acq_int atomic_fcmpset_int #define atomic_fcmpset_rel_int atomic_fcmpset_int #define atomic_set_acq_long atomic_set_barr_long #define atomic_set_rel_long atomic_set_barr_long #define atomic_clear_acq_long atomic_clear_barr_long #define atomic_clear_rel_long atomic_clear_barr_long #define atomic_add_acq_long atomic_add_barr_long #define atomic_add_rel_long atomic_add_barr_long #define atomic_subtract_acq_long atomic_subtract_barr_long #define atomic_subtract_rel_long atomic_subtract_barr_long #define atomic_cmpset_acq_long atomic_cmpset_long #define atomic_cmpset_rel_long atomic_cmpset_long #define atomic_fcmpset_acq_long atomic_fcmpset_long #define atomic_fcmpset_rel_long atomic_fcmpset_long #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) /* Operations on 8-bit bytes. */ #define atomic_set_8 atomic_set_char #define atomic_set_acq_8 atomic_set_acq_char #define atomic_set_rel_8 atomic_set_rel_char #define atomic_clear_8 atomic_clear_char #define atomic_clear_acq_8 atomic_clear_acq_char #define atomic_clear_rel_8 atomic_clear_rel_char #define atomic_add_8 atomic_add_char #define atomic_add_acq_8 atomic_add_acq_char #define atomic_add_rel_8 atomic_add_rel_char #define atomic_subtract_8 atomic_subtract_char #define atomic_subtract_acq_8 atomic_subtract_acq_char #define atomic_subtract_rel_8 atomic_subtract_rel_char #define atomic_load_acq_8 atomic_load_acq_char #define atomic_store_rel_8 atomic_store_rel_char #define atomic_cmpset_8 atomic_cmpset_char #define atomic_cmpset_acq_8 atomic_cmpset_acq_char #define atomic_cmpset_rel_8 atomic_cmpset_rel_char #define atomic_fcmpset_8 atomic_fcmpset_char #define atomic_fcmpset_acq_8 atomic_fcmpset_acq_char #define atomic_fcmpset_rel_8 atomic_fcmpset_rel_char /* Operations on 16-bit words. */ #define atomic_set_16 atomic_set_short #define atomic_set_acq_16 atomic_set_acq_short #define atomic_set_rel_16 atomic_set_rel_short #define atomic_clear_16 atomic_clear_short #define atomic_clear_acq_16 atomic_clear_acq_short #define atomic_clear_rel_16 atomic_clear_rel_short #define atomic_add_16 atomic_add_short #define atomic_add_acq_16 atomic_add_acq_short #define atomic_add_rel_16 atomic_add_rel_short #define atomic_subtract_16 atomic_subtract_short #define atomic_subtract_acq_16 atomic_subtract_acq_short #define atomic_subtract_rel_16 atomic_subtract_rel_short #define atomic_load_acq_16 atomic_load_acq_short #define atomic_store_rel_16 atomic_store_rel_short #define atomic_cmpset_16 atomic_cmpset_short #define atomic_cmpset_acq_16 atomic_cmpset_acq_short #define atomic_cmpset_rel_16 atomic_cmpset_rel_short #define atomic_fcmpset_16 atomic_fcmpset_short #define atomic_fcmpset_acq_16 atomic_fcmpset_acq_short #define atomic_fcmpset_rel_16 atomic_fcmpset_rel_short /* Operations on 32-bit double words. */ #define atomic_set_32 atomic_set_int #define atomic_set_acq_32 atomic_set_acq_int #define atomic_set_rel_32 atomic_set_rel_int #define atomic_clear_32 atomic_clear_int #define atomic_clear_acq_32 atomic_clear_acq_int #define atomic_clear_rel_32 atomic_clear_rel_int #define atomic_add_32 atomic_add_int #define atomic_add_acq_32 atomic_add_acq_int #define atomic_add_rel_32 atomic_add_rel_int #define atomic_subtract_32 atomic_subtract_int #define atomic_subtract_acq_32 atomic_subtract_acq_int #define atomic_subtract_rel_32 atomic_subtract_rel_int #define atomic_load_acq_32 atomic_load_acq_int #define atomic_store_rel_32 atomic_store_rel_int #define atomic_cmpset_32 atomic_cmpset_int #define atomic_cmpset_acq_32 atomic_cmpset_acq_int #define atomic_cmpset_rel_32 atomic_cmpset_rel_int #define atomic_fcmpset_32 atomic_fcmpset_int #define atomic_fcmpset_acq_32 atomic_fcmpset_acq_int #define atomic_fcmpset_rel_32 atomic_fcmpset_rel_int #define atomic_swap_32 atomic_swap_int #define atomic_readandclear_32 atomic_readandclear_int #define atomic_fetchadd_32 atomic_fetchadd_int #define atomic_testandset_32 atomic_testandset_int #define atomic_testandclear_32 atomic_testandclear_int /* Operations on pointers. */ #define atomic_set_ptr(p, v) \ atomic_set_int((volatile u_int *)(p), (u_int)(v)) #define atomic_set_acq_ptr(p, v) \ atomic_set_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_set_rel_ptr(p, v) \ atomic_set_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_ptr(p, v) \ atomic_clear_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_acq_ptr(p, v) \ atomic_clear_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_rel_ptr(p, v) \ atomic_clear_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_ptr(p, v) \ atomic_add_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_acq_ptr(p, v) \ atomic_add_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_rel_ptr(p, v) \ atomic_add_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_ptr(p, v) \ atomic_subtract_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_acq_ptr(p, v) \ atomic_subtract_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_rel_ptr(p, v) \ atomic_subtract_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_load_acq_ptr(p) \ atomic_load_acq_int((volatile u_int *)(p)) #define atomic_store_rel_ptr(p, v) \ atomic_store_rel_int((volatile u_int *)(p), (v)) #define atomic_cmpset_ptr(dst, old, new) \ atomic_cmpset_int((volatile u_int *)(dst), (u_int)(old), (u_int)(new)) #define atomic_cmpset_acq_ptr(dst, old, new) \ atomic_cmpset_acq_int((volatile u_int *)(dst), (u_int)(old), \ (u_int)(new)) #define atomic_cmpset_rel_ptr(dst, old, new) \ atomic_cmpset_rel_int((volatile u_int *)(dst), (u_int)(old), \ (u_int)(new)) #define atomic_fcmpset_ptr(dst, old, new) \ atomic_fcmpset_int((volatile u_int *)(dst), (u_int *)(old), (u_int)(new)) #define atomic_fcmpset_acq_ptr(dst, old, new) \ atomic_fcmpset_acq_int((volatile u_int *)(dst), (u_int *)(old), \ (u_int)(new)) #define atomic_fcmpset_rel_ptr(dst, old, new) \ atomic_fcmpset_rel_int((volatile u_int *)(dst), (u_int *)(old), \ (u_int)(new)) #define atomic_swap_ptr(p, v) \ atomic_swap_int((volatile u_int *)(p), (u_int)(v)) #define atomic_readandclear_ptr(p) \ atomic_readandclear_int((volatile u_int *)(p)) #endif /* !WANT_FUNCTIONS */ #if defined(_KERNEL) #define mb() __mbk() #define wmb() __mbk() #define rmb() __mbk() #else #define mb() __mbu() #define wmb() __mbu() #define rmb() __mbu() #endif #endif /* !_MACHINE_ATOMIC_H_ */ Index: projects/bsd_rdma_4_9/sys/kern/init_main.c =================================================================== --- projects/bsd_rdma_4_9/sys/kern/init_main.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/kern/init_main.c (revision 326162) @@ -1,875 +1,875 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; EVENTHANDLER_LIST_DECLARE(process_init); EVENTHANDLER_LIST_DECLARE(thread_init); EVENTHANDLER_LIST_DECLARE(process_ctor); EVENTHANDLER_LIST_DECLARE(thread_ctor); /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; struct uidinfo tmpuinfo; struct loginclass tmplc = { .lc_name = "", }; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ /* A hack to prevent uifind from tripping over NULL pointers. */ curthread->td_ucred = newcred; tmpuinfo.ui_uid = 1; newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_loginclass = &tmplc; newcred->cr_loginclass = loginclass_find("default"); /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_DIRECT_INVOKE(process_init, p); EVENTHANDLER_DIRECT_INVOKE(thread_init, td); EVENTHANDLER_DIRECT_INVOKE(process_ctor, p); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ - if ((error = sys_execve(td, &args)) == 0) { + if ((error = sys_execve(td, &args)) == EJUSTRETURN) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crfree(td->td_ucred); td->td_ucred = crhold(initproc->p_ucred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); Index: projects/bsd_rdma_4_9/sys/kern/kern_exec.c =================================================================== --- projects/bsd_rdma_4_9/sys/kern/kern_exec.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/kern/kern_exec.c (revision 326162) @@ -1,1734 +1,1740 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); EVENTHANDLER_LIST_DECLARE(process_exec); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ - if (error == 0) + if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif - return (error); + /* + * We don't want cpu_set_syscall_retval() to overwrite any of + * the register values put in place by exec_setregs(). + * Implementations of cpu_set_syscall_retval() will leave + * registers unmodified when returning EJUSTRETURN. + */ + return (error == 0 ? EJUSTRETURN : error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } vm_page_xunbusy(ma[0]); for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* An exec terminates mlockall(MCL_FUTURE). */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; if (argkva->gen != gen) { vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: projects/bsd_rdma_4_9/sys/netinet/sctp_ss_functions.c =================================================================== --- projects/bsd_rdma_4_9/sys/netinet/sctp_ss_functions.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/netinet/sctp_ss_functions.c (revision 326162) @@ -1,997 +1,999 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause + * * Copyright (c) 2010-2012, by Michael Tuexen. All rights reserved. * Copyright (c) 2010-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2010-2012, by Robin Seggelmann. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include /* * Default simple round-robin algorithm. * Just interates the streams in the order they appear. */ static void sctp_ss_default_add(struct sctp_tcb *, struct sctp_association *, struct sctp_stream_out *, struct sctp_stream_queue_pending *, int); static void sctp_ss_default_remove(struct sctp_tcb *, struct sctp_association *, struct sctp_stream_out *, struct sctp_stream_queue_pending *, int); static void sctp_ss_default_init(struct sctp_tcb *stcb, struct sctp_association *asoc, int holds_lock) { uint16_t i; asoc->ss_data.locked_on_sending = NULL; asoc->ss_data.last_out_stream = NULL; TAILQ_INIT(&asoc->ss_data.out.wheel); /* * If there is data in the stream queues already, the scheduler of * an existing association has been changed. We need to add all * stream queues to the wheel. */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, &stcb->asoc, &stcb->asoc.strmout[i], NULL, holds_lock); } return; } static void sctp_ss_default_clear(struct sctp_tcb *stcb, struct sctp_association *asoc, int clear_values SCTP_UNUSED, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } while (!TAILQ_EMPTY(&asoc->ss_data.out.wheel)) { struct sctp_stream_out *strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); TAILQ_REMOVE(&asoc->ss_data.out.wheel, TAILQ_FIRST(&asoc->ss_data.out.wheel), ss_params.rr.next_spoke); strq->ss_params.rr.next_spoke.tqe_next = NULL; strq->ss_params.rr.next_spoke.tqe_prev = NULL; } asoc->ss_data.last_out_stream = NULL; if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static void sctp_ss_default_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq) { if (with_strq != NULL) { if (stcb->asoc.ss_data.locked_on_sending == with_strq) { stcb->asoc.ss_data.locked_on_sending = strq; } if (stcb->asoc.ss_data.last_out_stream == with_strq) { stcb->asoc.ss_data.last_out_stream = strq; } } strq->ss_params.rr.next_spoke.tqe_next = NULL; strq->ss_params.rr.next_spoke.tqe_prev = NULL; return; } static void sctp_ss_default_add(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } /* Add to wheel if not already on it and stream queue not empty */ if (!TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.rr.next_spoke.tqe_next == NULL) && (strq->ss_params.rr.next_spoke.tqe_prev == NULL)) { TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke); } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static int sctp_ss_default_is_empty(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc) { if (TAILQ_EMPTY(&asoc->ss_data.out.wheel)) { return (1); } else { return (0); } } static void sctp_ss_default_remove(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } /* * Remove from wheel if stream queue is empty and actually is on the * wheel */ if (TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.rr.next_spoke.tqe_next != NULL || strq->ss_params.rr.next_spoke.tqe_prev != NULL)) { if (asoc->ss_data.last_out_stream == strq) { asoc->ss_data.last_out_stream = TAILQ_PREV(asoc->ss_data.last_out_stream, sctpwheel_listhead, ss_params.rr.next_spoke); if (asoc->ss_data.last_out_stream == NULL) { asoc->ss_data.last_out_stream = TAILQ_LAST(&asoc->ss_data.out.wheel, sctpwheel_listhead); } if (asoc->ss_data.last_out_stream == strq) { asoc->ss_data.last_out_stream = NULL; } } TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke); strq->ss_params.rr.next_spoke.tqe_next = NULL; strq->ss_params.rr.next_spoke.tqe_prev = NULL; } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static struct sctp_stream_out * sctp_ss_default_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net, struct sctp_association *asoc) { struct sctp_stream_out *strq, *strqt; if (asoc->ss_data.locked_on_sending) { return (asoc->ss_data.locked_on_sending); } strqt = asoc->ss_data.last_out_stream; default_again: /* Find the next stream to use */ if (strqt == NULL) { strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); } else { strq = TAILQ_NEXT(strqt, ss_params.rr.next_spoke); if (strq == NULL) { strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); } } /* * If CMT is off, we must validate that the stream in question has * the first item pointed towards are network destination requested * by the caller. Note that if we turn out to be locked to a stream * (assigning TSN's then we must stop, since we cannot look for * another stream with data to send to that destination). In CMT's * case, by skipping this check, we will send one data packet * towards the requested net. */ if (net != NULL && strq != NULL && SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) { if (TAILQ_FIRST(&strq->outqueue) && TAILQ_FIRST(&strq->outqueue)->net != NULL && TAILQ_FIRST(&strq->outqueue)->net != net) { if (strq == asoc->ss_data.last_out_stream) { return (NULL); } else { strqt = strq; goto default_again; } } } return (strq); } static void sctp_ss_default_scheduled(struct sctp_tcb *stcb, struct sctp_nets *net SCTP_UNUSED, struct sctp_association *asoc, struct sctp_stream_out *strq, int moved_how_much SCTP_UNUSED) { struct sctp_stream_queue_pending *sp; asoc->ss_data.last_out_stream = strq; if (stcb->asoc.idata_supported == 0) { sp = TAILQ_FIRST(&strq->outqueue); if ((sp != NULL) && (sp->some_taken == 1)) { stcb->asoc.ss_data.locked_on_sending = strq; } else { stcb->asoc.ss_data.locked_on_sending = NULL; } } else { stcb->asoc.ss_data.locked_on_sending = NULL; } return; } static void sctp_ss_default_packet_done(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED) { /* Nothing to be done here */ return; } static int sctp_ss_default_get_value(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED, struct sctp_stream_out *strq SCTP_UNUSED, uint16_t *value SCTP_UNUSED) { /* Nothing to be done here */ return (-1); } static int sctp_ss_default_set_value(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED, struct sctp_stream_out *strq SCTP_UNUSED, uint16_t value SCTP_UNUSED) { /* Nothing to be done here */ return (-1); } static int sctp_ss_default_is_user_msgs_incomplete(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc) { struct sctp_stream_out *strq; struct sctp_stream_queue_pending *sp; if (asoc->stream_queue_cnt != 1) { return (0); } strq = asoc->ss_data.locked_on_sending; if (strq == NULL) { return (0); } sp = TAILQ_FIRST(&strq->outqueue); if (sp == NULL) { return (0); } return (!sp->msg_is_complete); } /* * Real round-robin algorithm. * Always interates the streams in ascending order. */ static void sctp_ss_rr_add(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { struct sctp_stream_out *strqt; if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } if (!TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.rr.next_spoke.tqe_next == NULL) && (strq->ss_params.rr.next_spoke.tqe_prev == NULL)) { if (TAILQ_EMPTY(&asoc->ss_data.out.wheel)) { TAILQ_INSERT_HEAD(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke); } else { strqt = TAILQ_FIRST(&asoc->ss_data.out.wheel); while (strqt != NULL && (strqt->sid < strq->sid)) { strqt = TAILQ_NEXT(strqt, ss_params.rr.next_spoke); } if (strqt != NULL) { TAILQ_INSERT_BEFORE(strqt, strq, ss_params.rr.next_spoke); } else { TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke); } } } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } /* * Real round-robin per packet algorithm. * Always interates the streams in ascending order and * only fills messages of the same stream in a packet. */ static struct sctp_stream_out * sctp_ss_rrp_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net SCTP_UNUSED, struct sctp_association *asoc) { return (asoc->ss_data.last_out_stream); } static void sctp_ss_rrp_packet_done(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net, struct sctp_association *asoc) { struct sctp_stream_out *strq, *strqt; strqt = asoc->ss_data.last_out_stream; rrp_again: /* Find the next stream to use */ if (strqt == NULL) { strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); } else { strq = TAILQ_NEXT(strqt, ss_params.rr.next_spoke); if (strq == NULL) { strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); } } /* * If CMT is off, we must validate that the stream in question has * the first item pointed towards are network destination requested * by the caller. Note that if we turn out to be locked to a stream * (assigning TSN's then we must stop, since we cannot look for * another stream with data to send to that destination). In CMT's * case, by skipping this check, we will send one data packet * towards the requested net. */ if (net != NULL && strq != NULL && SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) { if (TAILQ_FIRST(&strq->outqueue) && TAILQ_FIRST(&strq->outqueue)->net != NULL && TAILQ_FIRST(&strq->outqueue)->net != net) { if (strq == asoc->ss_data.last_out_stream) { strq = NULL; } else { strqt = strq; goto rrp_again; } } } asoc->ss_data.last_out_stream = strq; return; } /* * Priority algorithm. * Always prefers streams based on their priority id. */ static void sctp_ss_prio_clear(struct sctp_tcb *stcb, struct sctp_association *asoc, int clear_values, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } while (!TAILQ_EMPTY(&asoc->ss_data.out.wheel)) { struct sctp_stream_out *strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); if (clear_values) { strq->ss_params.prio.priority = 0; } TAILQ_REMOVE(&asoc->ss_data.out.wheel, TAILQ_FIRST(&asoc->ss_data.out.wheel), ss_params.prio.next_spoke); strq->ss_params.prio.next_spoke.tqe_next = NULL; strq->ss_params.prio.next_spoke.tqe_prev = NULL; } asoc->ss_data.last_out_stream = NULL; if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static void sctp_ss_prio_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq) { if (with_strq != NULL) { if (stcb->asoc.ss_data.locked_on_sending == with_strq) { stcb->asoc.ss_data.locked_on_sending = strq; } if (stcb->asoc.ss_data.last_out_stream == with_strq) { stcb->asoc.ss_data.last_out_stream = strq; } } strq->ss_params.prio.next_spoke.tqe_next = NULL; strq->ss_params.prio.next_spoke.tqe_prev = NULL; if (with_strq != NULL) { strq->ss_params.prio.priority = with_strq->ss_params.prio.priority; } else { strq->ss_params.prio.priority = 0; } return; } static void sctp_ss_prio_add(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { struct sctp_stream_out *strqt; if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } /* Add to wheel if not already on it and stream queue not empty */ if (!TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.prio.next_spoke.tqe_next == NULL) && (strq->ss_params.prio.next_spoke.tqe_prev == NULL)) { if (TAILQ_EMPTY(&asoc->ss_data.out.wheel)) { TAILQ_INSERT_HEAD(&asoc->ss_data.out.wheel, strq, ss_params.prio.next_spoke); } else { strqt = TAILQ_FIRST(&asoc->ss_data.out.wheel); while (strqt != NULL && strqt->ss_params.prio.priority < strq->ss_params.prio.priority) { strqt = TAILQ_NEXT(strqt, ss_params.prio.next_spoke); } if (strqt != NULL) { TAILQ_INSERT_BEFORE(strqt, strq, ss_params.prio.next_spoke); } else { TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.prio.next_spoke); } } } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static void sctp_ss_prio_remove(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } /* * Remove from wheel if stream queue is empty and actually is on the * wheel */ if (TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.prio.next_spoke.tqe_next != NULL || strq->ss_params.prio.next_spoke.tqe_prev != NULL)) { if (asoc->ss_data.last_out_stream == strq) { asoc->ss_data.last_out_stream = TAILQ_PREV(asoc->ss_data.last_out_stream, sctpwheel_listhead, ss_params.prio.next_spoke); if (asoc->ss_data.last_out_stream == NULL) { asoc->ss_data.last_out_stream = TAILQ_LAST(&asoc->ss_data.out.wheel, sctpwheel_listhead); } if (asoc->ss_data.last_out_stream == strq) { asoc->ss_data.last_out_stream = NULL; } } TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq, ss_params.prio.next_spoke); strq->ss_params.prio.next_spoke.tqe_next = NULL; strq->ss_params.prio.next_spoke.tqe_prev = NULL; } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static struct sctp_stream_out * sctp_ss_prio_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net, struct sctp_association *asoc) { struct sctp_stream_out *strq, *strqt, *strqn; strqt = asoc->ss_data.last_out_stream; prio_again: /* Find the next stream to use */ if (strqt == NULL) { strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); } else { strqn = TAILQ_NEXT(strqt, ss_params.prio.next_spoke); if (strqn != NULL && strqn->ss_params.prio.priority == strqt->ss_params.prio.priority) { strq = strqn; } else { strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); } } /* * If CMT is off, we must validate that the stream in question has * the first item pointed towards are network destination requested * by the caller. Note that if we turn out to be locked to a stream * (assigning TSN's then we must stop, since we cannot look for * another stream with data to send to that destination). In CMT's * case, by skipping this check, we will send one data packet * towards the requested net. */ if (net != NULL && strq != NULL && SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) { if (TAILQ_FIRST(&strq->outqueue) && TAILQ_FIRST(&strq->outqueue)->net != NULL && TAILQ_FIRST(&strq->outqueue)->net != net) { if (strq == asoc->ss_data.last_out_stream) { return (NULL); } else { strqt = strq; goto prio_again; } } } return (strq); } static int sctp_ss_prio_get_value(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED, struct sctp_stream_out *strq, uint16_t *value) { if (strq == NULL) { return (-1); } *value = strq->ss_params.prio.priority; return (1); } static int sctp_ss_prio_set_value(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, uint16_t value) { if (strq == NULL) { return (-1); } strq->ss_params.prio.priority = value; sctp_ss_prio_remove(stcb, asoc, strq, NULL, 1); sctp_ss_prio_add(stcb, asoc, strq, NULL, 1); return (1); } /* * Fair bandwidth algorithm. * Maintains an equal troughput per stream. */ static void sctp_ss_fb_clear(struct sctp_tcb *stcb, struct sctp_association *asoc, int clear_values, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } while (!TAILQ_EMPTY(&asoc->ss_data.out.wheel)) { struct sctp_stream_out *strq = TAILQ_FIRST(&asoc->ss_data.out.wheel); if (clear_values) { strq->ss_params.fb.rounds = -1; } TAILQ_REMOVE(&asoc->ss_data.out.wheel, TAILQ_FIRST(&asoc->ss_data.out.wheel), ss_params.fb.next_spoke); strq->ss_params.fb.next_spoke.tqe_next = NULL; strq->ss_params.fb.next_spoke.tqe_prev = NULL; } asoc->ss_data.last_out_stream = NULL; if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static void sctp_ss_fb_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq) { if (with_strq != NULL) { if (stcb->asoc.ss_data.locked_on_sending == with_strq) { stcb->asoc.ss_data.locked_on_sending = strq; } if (stcb->asoc.ss_data.last_out_stream == with_strq) { stcb->asoc.ss_data.last_out_stream = strq; } } strq->ss_params.fb.next_spoke.tqe_next = NULL; strq->ss_params.fb.next_spoke.tqe_prev = NULL; if (with_strq != NULL) { strq->ss_params.fb.rounds = with_strq->ss_params.fb.rounds; } else { strq->ss_params.fb.rounds = -1; } return; } static void sctp_ss_fb_add(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } if (!TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.fb.next_spoke.tqe_next == NULL) && (strq->ss_params.fb.next_spoke.tqe_prev == NULL)) { if (strq->ss_params.fb.rounds < 0) strq->ss_params.fb.rounds = TAILQ_FIRST(&strq->outqueue)->length; TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.fb.next_spoke); } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static void sctp_ss_fb_remove(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } /* * Remove from wheel if stream queue is empty and actually is on the * wheel */ if (TAILQ_EMPTY(&strq->outqueue) && (strq->ss_params.fb.next_spoke.tqe_next != NULL || strq->ss_params.fb.next_spoke.tqe_prev != NULL)) { if (asoc->ss_data.last_out_stream == strq) { asoc->ss_data.last_out_stream = TAILQ_PREV(asoc->ss_data.last_out_stream, sctpwheel_listhead, ss_params.fb.next_spoke); if (asoc->ss_data.last_out_stream == NULL) { asoc->ss_data.last_out_stream = TAILQ_LAST(&asoc->ss_data.out.wheel, sctpwheel_listhead); } if (asoc->ss_data.last_out_stream == strq) { asoc->ss_data.last_out_stream = NULL; } } TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq, ss_params.fb.next_spoke); strq->ss_params.fb.next_spoke.tqe_next = NULL; strq->ss_params.fb.next_spoke.tqe_prev = NULL; } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static struct sctp_stream_out * sctp_ss_fb_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net, struct sctp_association *asoc) { struct sctp_stream_out *strq = NULL, *strqt; if (asoc->ss_data.last_out_stream == NULL || TAILQ_FIRST(&asoc->ss_data.out.wheel) == TAILQ_LAST(&asoc->ss_data.out.wheel, sctpwheel_listhead)) { strqt = TAILQ_FIRST(&asoc->ss_data.out.wheel); } else { strqt = TAILQ_NEXT(asoc->ss_data.last_out_stream, ss_params.fb.next_spoke); } do { if ((strqt != NULL) && ((SCTP_BASE_SYSCTL(sctp_cmt_on_off) > 0) || (SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0 && (net == NULL || (TAILQ_FIRST(&strqt->outqueue) && TAILQ_FIRST(&strqt->outqueue)->net == NULL) || (net != NULL && TAILQ_FIRST(&strqt->outqueue) && TAILQ_FIRST(&strqt->outqueue)->net != NULL && TAILQ_FIRST(&strqt->outqueue)->net == net))))) { if ((strqt->ss_params.fb.rounds >= 0) && (strq == NULL || strqt->ss_params.fb.rounds < strq->ss_params.fb.rounds)) { strq = strqt; } } if (strqt != NULL) { strqt = TAILQ_NEXT(strqt, ss_params.fb.next_spoke); } else { strqt = TAILQ_FIRST(&asoc->ss_data.out.wheel); } } while (strqt != strq); return (strq); } static void sctp_ss_fb_scheduled(struct sctp_tcb *stcb, struct sctp_nets *net SCTP_UNUSED, struct sctp_association *asoc, struct sctp_stream_out *strq, int moved_how_much SCTP_UNUSED) { struct sctp_stream_queue_pending *sp; struct sctp_stream_out *strqt; int subtract; if (stcb->asoc.idata_supported == 0) { sp = TAILQ_FIRST(&strq->outqueue); if ((sp != NULL) && (sp->some_taken == 1)) { stcb->asoc.ss_data.locked_on_sending = strq; } else { stcb->asoc.ss_data.locked_on_sending = NULL; } } else { stcb->asoc.ss_data.locked_on_sending = NULL; } subtract = strq->ss_params.fb.rounds; TAILQ_FOREACH(strqt, &asoc->ss_data.out.wheel, ss_params.fb.next_spoke) { strqt->ss_params.fb.rounds -= subtract; if (strqt->ss_params.fb.rounds < 0) strqt->ss_params.fb.rounds = 0; } if (TAILQ_FIRST(&strq->outqueue)) { strq->ss_params.fb.rounds = TAILQ_FIRST(&strq->outqueue)->length; } else { strq->ss_params.fb.rounds = -1; } asoc->ss_data.last_out_stream = strq; return; } /* * First-come, first-serve algorithm. * Maintains the order provided by the application. */ static void sctp_ss_fcfs_add(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); static void sctp_ss_fcfs_init(struct sctp_tcb *stcb, struct sctp_association *asoc, int holds_lock) { uint32_t x, n = 0, add_more = 1; struct sctp_stream_queue_pending *sp; uint16_t i; TAILQ_INIT(&asoc->ss_data.out.list); /* * If there is data in the stream queues already, the scheduler of * an existing association has been changed. We can only cycle * through the stream queues and add everything to the FCFS queue. */ while (add_more) { add_more = 0; for (i = 0; i < stcb->asoc.streamoutcnt; i++) { sp = TAILQ_FIRST(&stcb->asoc.strmout[i].outqueue); x = 0; /* Find n. message in current stream queue */ while (sp != NULL && x < n) { sp = TAILQ_NEXT(sp, next); x++; } if (sp != NULL) { sctp_ss_fcfs_add(stcb, &stcb->asoc, &stcb->asoc.strmout[i], sp, holds_lock); add_more = 1; } } n++; } return; } static void sctp_ss_fcfs_clear(struct sctp_tcb *stcb, struct sctp_association *asoc, int clear_values, int holds_lock) { if (clear_values) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } while (!TAILQ_EMPTY(&asoc->ss_data.out.list)) { TAILQ_REMOVE(&asoc->ss_data.out.list, TAILQ_FIRST(&asoc->ss_data.out.list), ss_next); } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } } return; } static void sctp_ss_fcfs_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq) { if (with_strq != NULL) { if (stcb->asoc.ss_data.locked_on_sending == with_strq) { stcb->asoc.ss_data.locked_on_sending = strq; } if (stcb->asoc.ss_data.last_out_stream == with_strq) { stcb->asoc.ss_data.last_out_stream = strq; } } return; } static void sctp_ss_fcfs_add(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq SCTP_UNUSED, struct sctp_stream_queue_pending *sp, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } if (sp && (sp->ss_next.tqe_next == NULL) && (sp->ss_next.tqe_prev == NULL)) { TAILQ_INSERT_TAIL(&asoc->ss_data.out.list, sp, ss_next); } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static int sctp_ss_fcfs_is_empty(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc) { if (TAILQ_EMPTY(&asoc->ss_data.out.list)) { return (1); } else { return (0); } } static void sctp_ss_fcfs_remove(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq SCTP_UNUSED, struct sctp_stream_queue_pending *sp, int holds_lock) { if (holds_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } if (sp && ((sp->ss_next.tqe_next != NULL) || (sp->ss_next.tqe_prev != NULL))) { TAILQ_REMOVE(&asoc->ss_data.out.list, sp, ss_next); } if (holds_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } return; } static struct sctp_stream_out * sctp_ss_fcfs_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net, struct sctp_association *asoc) { struct sctp_stream_out *strq; struct sctp_stream_queue_pending *sp; sp = TAILQ_FIRST(&asoc->ss_data.out.list); default_again: if (sp != NULL) { strq = &asoc->strmout[sp->sid]; } else { strq = NULL; } /* * If CMT is off, we must validate that the stream in question has * the first item pointed towards are network destination requested * by the caller. Note that if we turn out to be locked to a stream * (assigning TSN's then we must stop, since we cannot look for * another stream with data to send to that destination). In CMT's * case, by skipping this check, we will send one data packet * towards the requested net. */ if (net != NULL && strq != NULL && SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) { if (TAILQ_FIRST(&strq->outqueue) && TAILQ_FIRST(&strq->outqueue)->net != NULL && TAILQ_FIRST(&strq->outqueue)->net != net) { sp = TAILQ_NEXT(sp, ss_next); goto default_again; } } return (strq); } const struct sctp_ss_functions sctp_ss_functions[] = { /* SCTP_SS_DEFAULT */ { .sctp_ss_init = sctp_ss_default_init, .sctp_ss_clear = sctp_ss_default_clear, .sctp_ss_init_stream = sctp_ss_default_init_stream, .sctp_ss_add_to_stream = sctp_ss_default_add, .sctp_ss_is_empty = sctp_ss_default_is_empty, .sctp_ss_remove_from_stream = sctp_ss_default_remove, .sctp_ss_select_stream = sctp_ss_default_select, .sctp_ss_scheduled = sctp_ss_default_scheduled, .sctp_ss_packet_done = sctp_ss_default_packet_done, .sctp_ss_get_value = sctp_ss_default_get_value, .sctp_ss_set_value = sctp_ss_default_set_value, .sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete }, /* SCTP_SS_ROUND_ROBIN */ { .sctp_ss_init = sctp_ss_default_init, .sctp_ss_clear = sctp_ss_default_clear, .sctp_ss_init_stream = sctp_ss_default_init_stream, .sctp_ss_add_to_stream = sctp_ss_rr_add, .sctp_ss_is_empty = sctp_ss_default_is_empty, .sctp_ss_remove_from_stream = sctp_ss_default_remove, .sctp_ss_select_stream = sctp_ss_default_select, .sctp_ss_scheduled = sctp_ss_default_scheduled, .sctp_ss_packet_done = sctp_ss_default_packet_done, .sctp_ss_get_value = sctp_ss_default_get_value, .sctp_ss_set_value = sctp_ss_default_set_value, .sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete }, /* SCTP_SS_ROUND_ROBIN_PACKET */ { .sctp_ss_init = sctp_ss_default_init, .sctp_ss_clear = sctp_ss_default_clear, .sctp_ss_init_stream = sctp_ss_default_init_stream, .sctp_ss_add_to_stream = sctp_ss_rr_add, .sctp_ss_is_empty = sctp_ss_default_is_empty, .sctp_ss_remove_from_stream = sctp_ss_default_remove, .sctp_ss_select_stream = sctp_ss_rrp_select, .sctp_ss_scheduled = sctp_ss_default_scheduled, .sctp_ss_packet_done = sctp_ss_rrp_packet_done, .sctp_ss_get_value = sctp_ss_default_get_value, .sctp_ss_set_value = sctp_ss_default_set_value, .sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete }, /* SCTP_SS_PRIORITY */ { .sctp_ss_init = sctp_ss_default_init, .sctp_ss_clear = sctp_ss_prio_clear, .sctp_ss_init_stream = sctp_ss_prio_init_stream, .sctp_ss_add_to_stream = sctp_ss_prio_add, .sctp_ss_is_empty = sctp_ss_default_is_empty, .sctp_ss_remove_from_stream = sctp_ss_prio_remove, .sctp_ss_select_stream = sctp_ss_prio_select, .sctp_ss_scheduled = sctp_ss_default_scheduled, .sctp_ss_packet_done = sctp_ss_default_packet_done, .sctp_ss_get_value = sctp_ss_prio_get_value, .sctp_ss_set_value = sctp_ss_prio_set_value, .sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete }, /* SCTP_SS_FAIR_BANDWITH */ { .sctp_ss_init = sctp_ss_default_init, .sctp_ss_clear = sctp_ss_fb_clear, .sctp_ss_init_stream = sctp_ss_fb_init_stream, .sctp_ss_add_to_stream = sctp_ss_fb_add, .sctp_ss_is_empty = sctp_ss_default_is_empty, .sctp_ss_remove_from_stream = sctp_ss_fb_remove, .sctp_ss_select_stream = sctp_ss_fb_select, .sctp_ss_scheduled = sctp_ss_fb_scheduled, .sctp_ss_packet_done = sctp_ss_default_packet_done, .sctp_ss_get_value = sctp_ss_default_get_value, .sctp_ss_set_value = sctp_ss_default_set_value, .sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete }, /* SCTP_SS_FIRST_COME */ { .sctp_ss_init = sctp_ss_fcfs_init, .sctp_ss_clear = sctp_ss_fcfs_clear, .sctp_ss_init_stream = sctp_ss_fcfs_init_stream, .sctp_ss_add_to_stream = sctp_ss_fcfs_add, .sctp_ss_is_empty = sctp_ss_fcfs_is_empty, .sctp_ss_remove_from_stream = sctp_ss_fcfs_remove, .sctp_ss_select_stream = sctp_ss_fcfs_select, .sctp_ss_scheduled = sctp_ss_default_scheduled, .sctp_ss_packet_done = sctp_ss_default_packet_done, .sctp_ss_get_value = sctp_ss_default_get_value, .sctp_ss_set_value = sctp_ss_default_set_value, .sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete } }; Index: projects/bsd_rdma_4_9/sys/powerpc/powerpc/exec_machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/powerpc/powerpc/exec_machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/powerpc/powerpc/exec_machdep.c (revision 326162) @@ -1,1085 +1,1072 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_fpu_emu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FPU_EMU #include #endif #ifdef COMPAT_FREEBSD32 #include #include #include typedef struct __ucontext32 { sigset_t uc_sigmask; mcontext32_t uc_mcontext; uint32_t uc_link; struct sigaltstack32 uc_stack; uint32_t uc_flags; uint32_t __spare__[4]; } ucontext32_t; struct sigframe32 { ucontext32_t sf_uc; struct siginfo32 sf_si; }; static int grab_mcontext32(struct thread *td, mcontext32_t *, int flags); #endif static int grab_mcontext(struct thread *, mcontext_t *, int); void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigacts *psp; struct sigframe sf; struct thread *td; struct proc *p; #ifdef COMPAT_FREEBSD32 struct siginfo32 siginfo32; struct sigframe32 sf32; #endif size_t sfpsize; caddr_t sfp, usfp; int oonstack, rndfsize; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; oonstack = sigonstack(tf->fixreg[1]); /* * Fill siginfo structure. */ ksi->ksi_info.si_signo = ksi->ksi_signo; ksi->ksi_info.si_addr = (void *)((tf->exc == EXC_DSI) ? tf->dar : tf->srr0); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32)) { siginfo_to_siginfo32(&ksi->ksi_info, &siginfo32); sig = siginfo32.si_signo; code = siginfo32.si_code; sfp = (caddr_t)&sf32; sfpsize = sizeof(sf32); rndfsize = roundup(sizeof(sf32), 16); /* * Save user context */ memset(&sf32, 0, sizeof(sf32)); grab_mcontext32(td, &sf32.sf_uc.uc_mcontext, 0); sf32.sf_uc.uc_sigmask = *mask; sf32.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf32.sf_uc.uc_stack.ss_size = (uint32_t)td->td_sigstk.ss_size; sf32.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf32.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; } else { #endif sig = ksi->ksi_signo; code = ksi->ksi_code; sfp = (caddr_t)&sf; sfpsize = sizeof(sf); #ifdef __powerpc64__ /* * 64-bit PPC defines a 288 byte scratch region * below the stack. */ rndfsize = 288 + roundup(sizeof(sf), 48); #else rndfsize = roundup(sizeof(sf), 16); #endif /* * Save user context */ memset(&sf, 0, sizeof(sf)); grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; #ifdef COMPAT_FREEBSD32 } #endif CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* * Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { usfp = (void *)(((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - rndfsize) & ~0xFul); } else { usfp = (void *)((tf->fixreg[1] - rndfsize) & ~0xFul); } /* * Save the floating-point state, if necessary, then copy it. */ /* XXX */ /* * Set up the registers to return to sigcode. * * r1/sp - sigframe ptr * lr - sig function, dispatched to by blrl in trampoline * r3 - sig number * r4 - SIGINFO ? &siginfo : exception code * r5 - user context * srr0 - trampoline function addr */ tf->lr = (register_t)catcher; tf->fixreg[1] = (register_t)usfp; tf->fixreg[FIRSTARG] = sig; #ifdef COMPAT_FREEBSD32 tf->fixreg[FIRSTARG+2] = (register_t)usfp + ((SV_PROC_FLAG(p, SV_ILP32)) ? offsetof(struct sigframe32, sf_uc) : offsetof(struct sigframe, sf_uc)); #else tf->fixreg[FIRSTARG+2] = (register_t)usfp + offsetof(struct sigframe, sf_uc); #endif if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* * Signal handler installed with SA_SIGINFO. */ #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32)) { sf32.sf_si = siginfo32; tf->fixreg[FIRSTARG+1] = (register_t)usfp + offsetof(struct sigframe32, sf_si); sf32.sf_si = siginfo32; } else { #endif tf->fixreg[FIRSTARG+1] = (register_t)usfp + offsetof(struct sigframe, sf_si); sf.sf_si = ksi->ksi_info; #ifdef COMPAT_FREEBSD32 } #endif } else { /* Old FreeBSD-style arguments. */ tf->fixreg[FIRSTARG+1] = code; tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? tf->dar : tf->srr0; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); tf->srr0 = (register_t)p->p_sysent->sv_sigcode_base; /* * copy the frame out to userland. */ if (copyout(sfp, usfp, sfpsize) != 0) { /* * Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); } CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->srr0, tf->fixreg[1]); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { ucontext_t uc; int error; CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_lr = tf->srr0; pcb->pcb_sp = tf->fixreg[1]; } /* * get_mcontext/sendsig helper routine that doesn't touch the * proc lock */ static int grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; int i; pcb = td->td_pcb; memset(mcp, 0, sizeof(mcontext_t)); mcp->mc_vers = _MC_VERSION; mcp->mc_flags = 0; memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe)); if (flags & GET_MC_CLEAR_RET) { mcp->mc_gpr[3] = 0; mcp->mc_gpr[4] = 0; } /* * This assumes that floating-point context is *not* lazy, * so if the thread has used FP there would have been a * FP-unavailable exception that would have set things up * correctly. */ if (pcb->pcb_flags & PCB_FPREGS) { if (pcb->pcb_flags & PCB_FPU) { KASSERT(td == curthread, ("get_mcontext: fp save not curthread")); critical_enter(); save_fpu(td); critical_exit(); } mcp->mc_flags |= _MC_FP_VALID; memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double)); for (i = 0; i < 32; i++) memcpy(&mcp->mc_fpreg[i], &pcb->pcb_fpu.fpr[i].fpr, sizeof(double)); } if (pcb->pcb_flags & PCB_VSX) { for (i = 0; i < 32; i++) memcpy(&mcp->mc_vsxfpreg[i], &pcb->pcb_fpu.fpr[i].vsr[2], sizeof(double)); } /* * Repeat for Altivec context */ if (pcb->pcb_flags & PCB_VEC) { KASSERT(td == curthread, ("get_mcontext: fp save not curthread")); critical_enter(); save_vec(td); critical_exit(); mcp->mc_flags |= _MC_AV_VALID; mcp->mc_vscr = pcb->pcb_vec.vscr; mcp->mc_vrsave = pcb->pcb_vec.vrsave; memcpy(mcp->mc_avec, pcb->pcb_vec.vr, sizeof(mcp->mc_avec)); } mcp->mc_len = sizeof(*mcp); return (0); } int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { int error; error = grab_mcontext(td, mcp, flags); if (error == 0) { PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]); PROC_UNLOCK(curthread->td_proc); } return (error); } int set_mcontext(struct thread *td, mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tf; register_t tls; int i; pcb = td->td_pcb; tf = td->td_frame; if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp)) return (EINVAL); /* * Don't let the user set privileged MSR bits */ if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) { return (EINVAL); } /* Copy trapframe, preserving TLS pointer across context change */ if (SV_PROC_FLAG(td->td_proc, SV_LP64)) tls = tf->fixreg[13]; else tls = tf->fixreg[2]; memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame)); if (SV_PROC_FLAG(td->td_proc, SV_LP64)) tf->fixreg[13] = tls; else tf->fixreg[2] = tls; if (mcp->mc_flags & _MC_FP_VALID) { /* enable_fpu() will happen lazily on a fault */ pcb->pcb_flags |= PCB_FPREGS; memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double)); bzero(pcb->pcb_fpu.fpr, sizeof(pcb->pcb_fpu.fpr)); for (i = 0; i < 32; i++) { memcpy(&pcb->pcb_fpu.fpr[i].fpr, &mcp->mc_fpreg[i], sizeof(double)); memcpy(&pcb->pcb_fpu.fpr[i].vsr[2], &mcp->mc_vsxfpreg[i], sizeof(double)); } } if (mcp->mc_flags & _MC_AV_VALID) { if ((pcb->pcb_flags & PCB_VEC) != PCB_VEC) { critical_enter(); enable_vec(td); critical_exit(); } pcb->pcb_vec.vscr = mcp->mc_vscr; pcb->pcb_vec.vrsave = mcp->mc_vrsave; memcpy(pcb->pcb_vec.vr, mcp->mc_avec, sizeof(mcp->mc_avec)); } return (0); } /* * Set set up registers on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; register_t argc; tf = trapframe(td); bzero(tf, sizeof *tf); #ifdef __powerpc64__ tf->fixreg[1] = -roundup(-stack + 48, 16); #else tf->fixreg[1] = -roundup(-stack + 8, 16); #endif /* * Set up arguments for _start(): * _start(argc, argv, envp, obj, cleanup, ps_strings); * * Notes: * - obj and cleanup are the auxilliary and termination * vectors. They are fixed up by ld.elf_so. * - ps_strings is a NetBSD extention, and will be * ignored by executables which are strictly * compliant with the SVR4 ABI. - * - * XXX We have to set both regs and retval here due to different - * XXX calling convention in trap.c and init_main.c. */ /* Collect argc from the user stack */ argc = fuword((void *)stack); - /* - * XXX PG: these get overwritten in the syscall return code. - * execve() should return EJUSTRETURN, like it does on NetBSD. - * Emulate by setting the syscall return value cells. The - * registers still have to be set for init's fork trampoline. - */ - td->td_retval[0] = argc; - td->td_retval[1] = stack + sizeof(register_t); tf->fixreg[3] = argc; tf->fixreg[4] = stack + sizeof(register_t); tf->fixreg[5] = stack + (2 + argc)*sizeof(register_t); tf->fixreg[6] = 0; /* auxillary vector */ tf->fixreg[7] = 0; /* termination vector */ tf->fixreg[8] = (register_t)imgp->ps_strings; /* NetBSD extension */ tf->srr0 = imgp->entry_addr; #ifdef __powerpc64__ tf->fixreg[12] = imgp->entry_addr; #ifdef AIM tf->srr1 = PSL_SF | PSL_USERSET | PSL_FE_DFLT; if (mfmsr() & PSL_HV) tf->srr1 |= PSL_HV; #elif defined(BOOKE) tf->srr1 = PSL_CM | PSL_USERSET | PSL_FE_DFLT; #endif #else tf->srr1 = PSL_USERSET | PSL_FE_DFLT; #endif td->td_pcb->pcb_flags = 0; } #ifdef COMPAT_FREEBSD32 void ppc32_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; uint32_t argc; tf = trapframe(td); bzero(tf, sizeof *tf); tf->fixreg[1] = -roundup(-stack + 8, 16); argc = fuword32((void *)stack); - td->td_retval[0] = argc; - td->td_retval[1] = stack + sizeof(uint32_t); tf->fixreg[3] = argc; tf->fixreg[4] = stack + sizeof(uint32_t); tf->fixreg[5] = stack + (2 + argc)*sizeof(uint32_t); tf->fixreg[6] = 0; /* auxillary vector */ tf->fixreg[7] = 0; /* termination vector */ tf->fixreg[8] = (register_t)imgp->ps_strings; /* NetBSD extension */ tf->srr0 = imgp->entry_addr; tf->srr1 = PSL_USERSET | PSL_FE_DFLT; #ifdef AIM tf->srr1 &= ~PSL_SF; if (mfmsr() & PSL_HV) tf->srr1 |= PSL_HV; #elif defined(BOOKE) tf->srr1 &= ~PSL_CM; #endif td->td_pcb->pcb_flags = 0; } #endif int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(regs, tf, sizeof(struct reg)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; int i; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPREGS) == 0) memset(fpregs, 0, sizeof(struct fpreg)); else { memcpy(&fpregs->fpscr, &pcb->pcb_fpu.fpscr, sizeof(double)); for (i = 0; i < 32; i++) memcpy(&fpregs->fpreg[i], &pcb->pcb_fpu.fpr[i].fpr, sizeof(double)); } return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(tf, regs, sizeof(struct reg)); return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; int i; pcb = td->td_pcb; pcb->pcb_flags |= PCB_FPREGS; memcpy(&pcb->pcb_fpu.fpscr, &fpregs->fpscr, sizeof(double)); for (i = 0; i < 32; i++) { memcpy(&pcb->pcb_fpu.fpr[i].fpr, &fpregs->fpreg[i], sizeof(double)); } return (0); } #ifdef COMPAT_FREEBSD32 int set_regs32(struct thread *td, struct reg32 *regs) { struct trapframe *tf; int i; tf = td->td_frame; for (i = 0; i < 32; i++) tf->fixreg[i] = regs->fixreg[i]; tf->lr = regs->lr; tf->cr = regs->cr; tf->xer = regs->xer; tf->ctr = regs->ctr; tf->srr0 = regs->pc; return (0); } int fill_regs32(struct thread *td, struct reg32 *regs) { struct trapframe *tf; int i; tf = td->td_frame; for (i = 0; i < 32; i++) regs->fixreg[i] = tf->fixreg[i]; regs->lr = tf->lr; regs->cr = tf->cr; regs->xer = tf->xer; regs->ctr = tf->ctr; regs->pc = tf->srr0; return (0); } static int grab_mcontext32(struct thread *td, mcontext32_t *mcp, int flags) { mcontext_t mcp64; int i, error; error = grab_mcontext(td, &mcp64, flags); if (error != 0) return (error); mcp->mc_vers = mcp64.mc_vers; mcp->mc_flags = mcp64.mc_flags; mcp->mc_onstack = mcp64.mc_onstack; mcp->mc_len = mcp64.mc_len; memcpy(mcp->mc_avec,mcp64.mc_avec,sizeof(mcp64.mc_avec)); memcpy(mcp->mc_av,mcp64.mc_av,sizeof(mcp64.mc_av)); for (i = 0; i < 42; i++) mcp->mc_frame[i] = mcp64.mc_frame[i]; memcpy(mcp->mc_fpreg,mcp64.mc_fpreg,sizeof(mcp64.mc_fpreg)); memcpy(mcp->mc_vsxfpreg,mcp64.mc_vsxfpreg,sizeof(mcp64.mc_vsxfpreg)); return (0); } static int get_mcontext32(struct thread *td, mcontext32_t *mcp, int flags) { int error; error = grab_mcontext32(td, mcp, flags); if (error == 0) { PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]); PROC_UNLOCK(curthread->td_proc); } return (error); } static int set_mcontext32(struct thread *td, mcontext32_t *mcp) { mcontext_t mcp64; int i, error; mcp64.mc_vers = mcp->mc_vers; mcp64.mc_flags = mcp->mc_flags; mcp64.mc_onstack = mcp->mc_onstack; mcp64.mc_len = mcp->mc_len; memcpy(mcp64.mc_avec,mcp->mc_avec,sizeof(mcp64.mc_avec)); memcpy(mcp64.mc_av,mcp->mc_av,sizeof(mcp64.mc_av)); for (i = 0; i < 42; i++) mcp64.mc_frame[i] = mcp->mc_frame[i]; mcp64.mc_srr1 |= (td->td_frame->srr1 & 0xFFFFFFFF00000000ULL); memcpy(mcp64.mc_fpreg,mcp->mc_fpreg,sizeof(mcp64.mc_fpreg)); memcpy(mcp64.mc_vsxfpreg,mcp->mc_vsxfpreg,sizeof(mcp64.mc_vsxfpreg)); error = set_mcontext(td, &mcp64); return (error); } #endif #ifdef COMPAT_FREEBSD32 int freebsd32_sigreturn(struct thread *td, struct freebsd32_sigreturn_args *uap) { ucontext32_t uc; int error; CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } error = set_mcontext32(td, &uc.uc_mcontext); if (error != 0) return (error); kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]); return (EJUSTRETURN); } /* * The first two fields of a ucontext_t are the signal mask and the machine * context. The next field is uc_link; we want to avoid destroying the link * when copying out contexts. */ #define UC32_COPY_SIZE offsetof(ucontext32_t, uc_link) int freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap) { ucontext32_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->ucp, UC32_COPY_SIZE); } return (ret); } int freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap) { ucontext32_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE); if (ret == 0) { ret = set_mcontext32(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } return (ret == 0 ? EJUSTRETURN : ret); } int freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap) { ucontext32_t uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC32_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE); if (ret == 0) { ret = set_mcontext32(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } } return (ret == 0 ? EJUSTRETURN : ret); } #endif void cpu_set_syscall_retval(struct thread *td, int error) { struct proc *p; struct trapframe *tf; int fixup; if (error == EJUSTRETURN) return; p = td->td_proc; tf = td->td_frame; if (tf->fixreg[0] == SYS___syscall && (SV_PROC_FLAG(p, SV_ILP32))) { int code = tf->fixreg[FIRSTARG + 1]; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; fixup = ( #if defined(COMPAT_FREEBSD6) && defined(SYS_freebsd6_lseek) code != SYS_freebsd6_lseek && #endif code != SYS_lseek) ? 1 : 0; } else fixup = 0; switch (error) { case 0: if (fixup) { /* * 64-bit return, 32-bit syscall. Fixup byte order */ tf->fixreg[FIRSTARG] = 0; tf->fixreg[FIRSTARG + 1] = td->td_retval[0]; } else { tf->fixreg[FIRSTARG] = td->td_retval[0]; tf->fixreg[FIRSTARG + 1] = td->td_retval[1]; } tf->cr &= ~0x10000000; /* Unset summary overflow */ break; case ERESTART: /* * Set user's pc back to redo the system call. */ tf->srr0 -= 4; break; default: tf->fixreg[FIRSTARG] = SV_ABI_ERRNO(p, error); tf->cr |= 0x10000000; /* Set summary overflow */ break; } } /* * Threading functions */ void cpu_thread_exit(struct thread *td) { } void cpu_thread_clean(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { struct pcb *pcb; pcb = (struct pcb *)((td->td_kstack + td->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb)) & ~0x2fUL); td->td_pcb = pcb; td->td_frame = (struct trapframe *)pcb - 1; } void cpu_thread_free(struct thread *td) { } int cpu_set_user_tls(struct thread *td, void *tls_base) { if (SV_PROC_FLAG(td->td_proc, SV_LP64)) td->td_frame->fixreg[13] = (register_t)tls_base + 0x7010; else td->td_frame->fixreg[2] = (register_t)tls_base + 0x7008; return (0); } void cpu_copy_thread(struct thread *td, struct thread *td0) { struct pcb *pcb2; struct trapframe *tf; struct callframe *cf; pcb2 = td->td_pcb; /* Copy the upcall pcb */ bcopy(td0->td_pcb, pcb2, sizeof(*pcb2)); /* Create a stack for the new thread */ tf = td->td_frame; bcopy(td0->td_frame, tf, sizeof(struct trapframe)); tf->fixreg[FIRSTARG] = 0; tf->fixreg[FIRSTARG + 1] = 0; tf->cr &= ~0x10000000; /* Set registers for trampoline to user mode. */ cf = (struct callframe *)tf - 1; memset(cf, 0, sizeof(struct callframe)); cf->cf_func = (register_t)fork_return; cf->cf_arg0 = (register_t)td; cf->cf_arg1 = (register_t)tf; pcb2->pcb_sp = (register_t)cf; #if defined(__powerpc64__) && (!defined(_CALL_ELF) || _CALL_ELF == 1) pcb2->pcb_lr = ((register_t *)fork_trampoline)[0]; pcb2->pcb_toc = ((register_t *)fork_trampoline)[1]; #else pcb2->pcb_lr = (register_t)fork_trampoline; pcb2->pcb_context[0] = pcb2->pcb_lr; #endif pcb2->pcb_cpu.aim.usr_vsid = 0; /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_msr = PSL_KERNSET; } void cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { struct trapframe *tf; uintptr_t sp; tf = td->td_frame; /* align stack and alloc space for frame ptr and saved LR */ #ifdef __powerpc64__ sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 48) & ~0x1f; #else sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 8) & ~0x1f; #endif bzero(tf, sizeof(struct trapframe)); tf->fixreg[1] = (register_t)sp; tf->fixreg[3] = (register_t)arg; if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { tf->srr0 = (register_t)entry; tf->srr1 = PSL_USERSET | PSL_FE_DFLT; #ifdef __powerpc64__ tf->srr1 &= ~PSL_SF; #endif } else { #ifdef __powerpc64__ register_t entry_desc[3]; (void)copyin((void *)entry, entry_desc, sizeof(entry_desc)); tf->srr0 = entry_desc[0]; tf->fixreg[2] = entry_desc[1]; tf->fixreg[11] = entry_desc[2]; tf->srr1 = PSL_SF | PSL_USERSET | PSL_FE_DFLT; #endif } #ifdef __powerpc64__ if (mfmsr() & PSL_HV) tf->srr1 |= PSL_HV; #endif td->td_pcb->pcb_flags = 0; td->td_retval[0] = (register_t)entry; td->td_retval[1] = 0; } int ppc_instr_emulate(struct trapframe *frame, struct pcb *pcb) { uint32_t instr; int reg, sig; instr = fuword32((void *)frame->srr0); sig = SIGILL; if ((instr & 0xfc1fffff) == 0x7c1f42a6) { /* mfpvr */ reg = (instr & ~0xfc1fffff) >> 21; frame->fixreg[reg] = mfpvr(); frame->srr0 += 4; return (0); } if ((instr & 0xfc000ffe) == 0x7c0004ac) { /* various sync */ powerpc_sync(); /* Do a heavy-weight sync */ frame->srr0 += 4; return (0); } #ifdef FPU_EMU if (!(pcb->pcb_flags & PCB_FPREGS)) { bzero(&pcb->pcb_fpu, sizeof(pcb->pcb_fpu)); pcb->pcb_flags |= PCB_FPREGS; } sig = fpu_emulate(frame, &pcb->pcb_fpu); #endif return (sig); } Index: projects/bsd_rdma_4_9/sys/riscv/riscv/machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/riscv/riscv/machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/riscv/riscv/machdep.c (revision 326162) @@ -1,894 +1,889 @@ /*- * Copyright (c) 2014 Andrew Turner * Copyright (c) 2015-2017 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_platform.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FPE #include #endif #ifdef FDT #include #include #endif struct pcpu __pcpu[MAXCPU]; static struct trapframe proc0_tf; vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2]; vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2]; int early_boot = 1; int cold = 1; long realmem = 0; long Maxmem = 0; #define DTB_SIZE_MAX (1024 * 1024) #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t physmap[PHYSMAP_SIZE]; u_int physmap_idx; struct kva_md_info kmi; int64_t dcache_line_size; /* The minimum D cache line size */ int64_t icache_line_size; /* The minimum I cache line size */ int64_t idcache_line_size; /* The minimum cache line size */ extern int *end; extern int *initstack_end; struct pcpu *pcpup; uintptr_t mcall_trap(uintptr_t mcause, uintptr_t* regs); uintptr_t mcall_trap(uintptr_t mcause, uintptr_t* regs) { return (0); } static void cpu_startup(void *dummy) { identify_cpu(); vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); int cpu_idle_wakeup(int cpu) { return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; regs->sepc = frame->tf_sepc; regs->sstatus = frame->tf_sstatus; regs->ra = frame->tf_ra; regs->sp = frame->tf_sp; regs->gp = frame->tf_gp; regs->tp = frame->tf_tp; memcpy(regs->t, frame->tf_t, sizeof(regs->t)); memcpy(regs->s, frame->tf_s, sizeof(regs->s)); memcpy(regs->a, frame->tf_a, sizeof(regs->a)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; frame->tf_sepc = regs->sepc; frame->tf_sstatus = regs->sstatus; frame->tf_ra = regs->ra; frame->tf_sp = regs->sp; frame->tf_gp = regs->gp; frame->tf_tp = regs->tp; memcpy(frame->tf_t, regs->t, sizeof(frame->tf_t)); memcpy(frame->tf_s, regs->s, sizeof(frame->tf_s)); memcpy(frame->tf_a, regs->a, sizeof(frame->tf_a)); return (0); } int fill_fpregs(struct thread *td, struct fpreg *regs) { #ifdef FPE struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) { /* * If we have just been running FPE instructions we will * need to save the state to memcpy it below. */ fpe_state_save(td); memcpy(regs->fp_x, pcb->pcb_x, sizeof(regs->fp_x)); regs->fp_fcsr = pcb->pcb_fcsr; } else #endif memset(regs->fp_x, 0, sizeof(regs->fp_x)); return (0); } int set_fpregs(struct thread *td, struct fpreg *regs) { #ifdef FPE struct pcb *pcb; pcb = td->td_pcb; memcpy(pcb->pcb_x, regs->fp_x, sizeof(regs->fp_x)); pcb->pcb_fcsr = regs->fp_fcsr; #endif return (0); } int fill_dbregs(struct thread *td, struct dbreg *regs) { panic("fill_dbregs"); } int set_dbregs(struct thread *td, struct dbreg *regs) { panic("set_dbregs"); } int ptrace_set_pc(struct thread *td, u_long addr) { panic("ptrace_set_pc"); return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; memset(tf, 0, sizeof(struct trapframe)); - /* - * We need to set a0 for init as it doesn't call - * cpu_set_syscall_retval to copy the value. We also - * need to set td_retval for the cases where we do. - */ - tf->tf_a[0] = td->td_retval[0] = stack; + tf->tf_a[0] = stack; tf->tf_sp = STACKALIGN(stack); tf->tf_ra = imgp->entry_addr; tf->tf_sepc = imgp->entry_addr; pcb->pcb_fpflags &= ~PCB_FP_STARTED; } /* Sanity check these are the same size, they will be memcpy'd to and fro */ CTASSERT(sizeof(((struct trapframe *)0)->tf_a) == sizeof((struct gpregs *)0)->gp_a); CTASSERT(sizeof(((struct trapframe *)0)->tf_s) == sizeof((struct gpregs *)0)->gp_s); CTASSERT(sizeof(((struct trapframe *)0)->tf_t) == sizeof((struct gpregs *)0)->gp_t); CTASSERT(sizeof(((struct trapframe *)0)->tf_a) == sizeof((struct reg *)0)->a); CTASSERT(sizeof(((struct trapframe *)0)->tf_s) == sizeof((struct reg *)0)->s); CTASSERT(sizeof(((struct trapframe *)0)->tf_t) == sizeof((struct reg *)0)->t); /* Support for FDT configurations only. */ CTASSERT(FDT); int get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret) { struct trapframe *tf = td->td_frame; memcpy(mcp->mc_gpregs.gp_t, tf->tf_t, sizeof(mcp->mc_gpregs.gp_t)); memcpy(mcp->mc_gpregs.gp_s, tf->tf_s, sizeof(mcp->mc_gpregs.gp_s)); memcpy(mcp->mc_gpregs.gp_a, tf->tf_a, sizeof(mcp->mc_gpregs.gp_a)); if (clear_ret & GET_MC_CLEAR_RET) { mcp->mc_gpregs.gp_a[0] = 0; mcp->mc_gpregs.gp_t[0] = 0; /* clear syscall error */ } mcp->mc_gpregs.gp_ra = tf->tf_ra; mcp->mc_gpregs.gp_sp = tf->tf_sp; mcp->mc_gpregs.gp_gp = tf->tf_gp; mcp->mc_gpregs.gp_tp = tf->tf_tp; mcp->mc_gpregs.gp_sepc = tf->tf_sepc; mcp->mc_gpregs.gp_sstatus = tf->tf_sstatus; return (0); } int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tf; tf = td->td_frame; memcpy(tf->tf_t, mcp->mc_gpregs.gp_t, sizeof(tf->tf_t)); memcpy(tf->tf_s, mcp->mc_gpregs.gp_s, sizeof(tf->tf_s)); memcpy(tf->tf_a, mcp->mc_gpregs.gp_a, sizeof(tf->tf_a)); tf->tf_ra = mcp->mc_gpregs.gp_ra; tf->tf_sp = mcp->mc_gpregs.gp_sp; tf->tf_gp = mcp->mc_gpregs.gp_gp; tf->tf_tp = mcp->mc_gpregs.gp_tp; tf->tf_sepc = mcp->mc_gpregs.gp_sepc; tf->tf_sstatus = mcp->mc_gpregs.gp_sstatus; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifdef FPE struct pcb *curpcb; critical_enter(); curpcb = curthread->td_pcb; KASSERT(td->td_pcb == curpcb, ("Invalid fpe pcb")); if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) { /* * If we have just been running FPE instructions we will * need to save the state to memcpy it below. */ fpe_state_save(td); KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0, ("Non-userspace FPE flags set in get_fpcontext")); memcpy(mcp->mc_fpregs.fp_x, curpcb->pcb_x, sizeof(mcp->mc_fpregs)); mcp->mc_fpregs.fp_fcsr = curpcb->pcb_fcsr; mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags; mcp->mc_flags |= _MC_FP_VALID; } critical_exit(); #endif } static void set_fpcontext(struct thread *td, mcontext_t *mcp) { #ifdef FPE struct pcb *curpcb; critical_enter(); if ((mcp->mc_flags & _MC_FP_VALID) != 0) { curpcb = curthread->td_pcb; /* FPE usage is enabled, override registers. */ memcpy(curpcb->pcb_x, mcp->mc_fpregs.fp_x, sizeof(mcp->mc_fpregs)); curpcb->pcb_fcsr = mcp->mc_fpregs.fp_fcsr; curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK; } critical_exit(); #endif } void cpu_idle(int busy) { spinlock_enter(); if (!busy) cpu_idleclock(); if (!sched_runnable()) __asm __volatile( "fence \n" "wfi \n"); if (!busy) cpu_activeclock(); spinlock_exit(); } void cpu_halt(void) { panic("cpu_halt"); } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* TBD */ } /* Get current clock frequency for the given CPU ID. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { panic("cpu_est_clockrate"); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) { td->td_md.md_spinlock_count = 1; td->td_md.md_saved_sstatus_ie = intr_disable(); } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t sstatus_ie; td = curthread; critical_exit(); sstatus_ie = td->td_md.md_saved_sstatus_ie; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(sstatus_ie); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { uint64_t sstatus; ucontext_t uc; int error; if (uap == NULL) return (EFAULT); if (copyin(uap->sigcntxp, &uc, sizeof(uc))) return (EFAULT); /* * Make sure the processor mode has not been tampered with and * interrupts have not been disabled. * Supervisor interrupts in user mode are always enabled. */ sstatus = uc.uc_mcontext.mc_gpregs.gp_sstatus; if ((sstatus & SSTATUS_SPP) != 0) return (EINVAL); error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); set_fpcontext(td, &uc.uc_mcontext); /* Restore signal mask. */ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { memcpy(pcb->pcb_t, tf->tf_t, sizeof(tf->tf_t)); memcpy(pcb->pcb_s, tf->tf_s, sizeof(tf->tf_s)); memcpy(pcb->pcb_a, tf->tf_a, sizeof(tf->tf_a)); pcb->pcb_ra = tf->tf_ra; pcb->pcb_sp = tf->tf_sp; pcb->pcb_gp = tf->tf_gp; pcb->pcb_tp = tf->tf_tp; pcb->pcb_sepc = tf->tf_sepc; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe *fp, frame; struct sysentvec *sysent; struct trapframe *tf; struct sigacts *psp; struct thread *td; struct proc *p; int onstack; int code; int sig; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size); } else { fp = (struct sigframe *)td->td_frame->tf_sp; } /* Make room, keeping the stack aligned */ fp--; fp = (struct sigframe *)STACKALIGN(fp); /* Fill in the frame to copy out */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); get_fpcontext(td, &frame.sf_uc.uc_mcontext); frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } tf->tf_a[0] = sig; tf->tf_a[1] = (register_t)&fp->sf_si; tf->tf_a[2] = (register_t)&fp->sf_uc; tf->tf_sepc = (register_t)catcher; tf->tf_sp = (register_t)fp; sysent = p->p_sysent; if (sysent->sv_sigcode_base != 0) tf->tf_ra = (register_t)sysent->sv_sigcode_base; else tf->tf_ra = (register_t)(sysent->sv_psstrings - *(sysent->sv_szsigcode)); CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_sepc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } static void init_proc0(vm_offset_t kstack) { pcpup = &__pcpu[0]; proc_linkup0(&proc0, &thread0); thread0.td_kstack = kstack; thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1; thread0.td_pcb->pcb_fpflags = 0; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, u_int *physmap_idxp) { u_int i, insert_idx, _physmap_idx; _physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = _physmap_idx; for (i = 0; i <= _physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= _physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } _physmap_idx += 2; *physmap_idxp = _physmap_idx; if (_physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = _physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; printf("physmap[%d] = 0x%016lx\n", insert_idx, base); printf("physmap[%d] = 0x%016lx\n", insert_idx + 1, base + length); return (1); } #ifdef FDT static void try_load_dtb(caddr_t kmdp, vm_offset_t dtbp) { #if defined(FDT_DTB_STATIC) dtbp = (vm_offset_t)&fdt_static_dtb; #endif if (dtbp == (vm_offset_t)NULL) { printf("ERROR loading DTB\n"); return; } if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); } #endif static void cache_setup(void) { /* TODO */ } /* * Fake up a boot descriptor table. * RISCVTODO: This needs to be done via loader (when it's available). */ vm_offset_t fake_preload_metadata(struct riscv_bootparams *rvbp __unused) { #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif vm_offset_t lastaddr; int i = 0; static uint32_t fake_preload[35]; fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("kernel") + 1; strcpy((char*)&fake_preload[i++], "kernel"); i += 1; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf64 kernel") + 1; strcpy((char*)&fake_preload[i++], "elf64 kernel"); i += 3; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = (uint64_t)(KERNBASE + KERNENTRY); i += 1; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint64_t); printf("end is 0x%016lx\n", (uint64_t)&end); fake_preload[i++] = (uint64_t)&end - (uint64_t)(KERNBASE + KERNENTRY); i += 1; #ifdef DDB #if 0 /* RISCVTODO */ if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); db_fetch_ksymtab(zstart, zend); } else #endif #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; return (lastaddr); } void initriscv(struct riscv_bootparams *rvbp) { struct mem_region mem_regions[FDT_MEM_REGIONS]; vm_offset_t rstart, rend; vm_offset_t s, e; int mem_regions_sz; vm_offset_t lastaddr; vm_size_t kernlen; caddr_t kmdp; int i; /* Set the module data location */ lastaddr = fake_preload_metadata(rvbp); /* Find the kernel address */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = RB_VERBOSE | RB_SINGLE; boothowto = RB_VERBOSE; kern_envp = NULL; #ifdef FDT try_load_dtb(kmdp, rvbp->dtbp_virt); #endif /* Load the physical memory ranges */ physmap_idx = 0; #ifdef FDT /* Grab physical memory regions information from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, NULL) != 0) panic("Cannot get physical memory regions"); s = rvbp->dtbp_phys; e = s + DTB_SIZE_MAX; for (i = 0; i < mem_regions_sz; i++) { rstart = mem_regions[i].mr_start; rend = (mem_regions[i].mr_start + mem_regions[i].mr_size); if ((rstart < s) && (rend > e)) { /* Exclude DTB region. */ add_physmap_entry(rstart, (s - rstart), physmap, &physmap_idx); add_physmap_entry(e, (rend - e), physmap, &physmap_idx); } else { add_physmap_entry(mem_regions[i].mr_start, mem_regions[i].mr_size, physmap, &physmap_idx); } } #endif /* Set the pcpu data, this is needed by pmap_bootstrap */ pcpup = &__pcpu[0]; pcpu_init(pcpup, 0, sizeof(struct pcpu)); /* Set the pcpu pointer */ __asm __volatile("mv gp, %0" :: "r"(pcpup)); PCPU_SET(curthread, &thread0); /* Do basic tuning, hz etc */ init_param1(); cache_setup(); /* Bootstrap enough of pmap to enter the kernel proper */ kernlen = (lastaddr - KERNBASE); pmap_bootstrap(rvbp->kern_l1pt, mem_regions[0].mr_start, kernlen); cninit(); init_proc0(rvbp->kern_stack); /* set page table base register for thread0 */ thread0.td_pcb->pcb_l1addr = \ (rvbp->kern_l1pt - KERNBASE + rvbp->kern_phys); msgbufinit(msgbufp, msgbufsize); mutex_init(); init_param2(physmem); kdb_init(); riscv_init_interrupts(); early_boot = 0; } #undef bzero void bzero(void *buf, size_t len) { uint8_t *p; p = buf; while(len-- > 0) *p++ = 0; } Index: projects/bsd_rdma_4_9/sys/sparc64/sparc64/machdep.c =================================================================== --- projects/bsd_rdma_4_9/sys/sparc64/sparc64/machdep.c (revision 326161) +++ projects/bsd_rdma_4_9/sys/sparc64/sparc64/machdep.c (revision 326162) @@ -1,1116 +1,1113 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef int ofw_vec_t(void *); int dtlb_slots; int itlb_slots; struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; void *dpcpu0; char pcpu0[PCPU_PAGES * PAGE_SIZE]; struct pcpu dummy_pcpu[MAXCPU]; struct trapframe frame0; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; u_int tba_taken_over; char sparc64_model[32]; static int cpu_use_vis = 1; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl); void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); static void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *arg); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz / PAGE_SIZE; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); printf("avail memory = %lu (%lu MB)\n", vm_cnt.v_free_count * PAGE_SIZE, vm_cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); cpu_identify(rdpr(ver), PCPU_GET(clock), curcpu); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = rdpr(pil); wrpr(pil, 0, PIL_TICK); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = pil; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t pil; td = curthread; critical_exit(); pil = td->td_md.md_saved_pil; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) wrpr(pil, pil, 0); } static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl) { char type[sizeof("cpu")]; phandle_t child; uint32_t portid; for (; node != 0; node = OF_peer(node)) { child = OF_child(node); if (child > 0) { child = find_bsp(child, bspid, cpu_impl); if (child > 0) return (child); } else { if (OF_getprop(node, "device_type", type, sizeof(type)) <= 0) continue; if (strcmp(type, "cpu") != 0) continue; if (OF_getprop(node, cpu_portid_prop(cpu_impl), &portid, sizeof(portid)) <= 0) continue; if (portid == bspid) return (node); } } return (0); } const char * cpu_portid_prop(u_int cpu_impl) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return ("upa-portid"); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return ("portid"); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return ("cpuid"); default: return (""); } } uint32_t cpu_get_mid(u_int cpu_impl) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return (UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: return (FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG, ASI_FIREPLANE_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return (JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return (INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID))); default: return (0); } } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { char *env; struct pcpu *pc; vm_offset_t end; vm_offset_t va; caddr_t kmdp; phandle_t root; u_int cpu_impl; end = 0; kmdp = NULL; /* * Find out what kind of CPU we have first, for anything that changes * behaviour. */ cpu_impl = VER_IMPL(rdpr(ver)); /* * Do CPU-specific initialization. */ if (cpu_impl >= CPU_IMPL_ULTRASPARCIII) cheetah_init(cpu_impl); else if (cpu_impl == CPU_IMPL_SPARC64V) zeus_init(cpu_impl); /* * Clear (S)TICK timer (including NPT). */ tick_clear(cpu_impl); /* * UltraSparc II[e,i] based systems come up with the tick interrupt * enabled and a handler that resets the tick counter, causing DELAY() * to not work properly when used early in boot. * UltraSPARC III based systems come up with the system tick interrupt * enabled, causing an interrupt storm on startup since they are not * handled. */ tick_stop(cpu_impl); /* * Set up Open Firmware entry points. */ ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit() gets the right value of boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } init_param1(); /* * Initialize Open Firmware (needed for console). */ OF_install(OFW_STD_DIRECT, 0); OF_init(ofw_entry); /* * Prime our per-CPU data page for use. Note, we are using it for * our stack, so don't pass the real size (PAGE_SIZE) to pcpu_init * or it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_addr = (vm_offset_t)pcpu0; pc->pc_impl = cpu_impl; pc->pc_mid = cpu_get_mid(cpu_impl); pc->pc_tlb_ctx = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX; /* * Determine the OFW node and frequency of the BSP (and ensure the * BSP is in the device tree in the first place). */ root = OF_peer(0); pc->pc_node = find_bsp(root, pc->pc_mid, cpu_impl); if (pc->pc_node == 0) OF_panic("%s: cannot find boot CPU node", __func__); if (OF_getprop(pc->pc_node, "clock-frequency", &pc->pc_clock, sizeof(pc->pc_clock)) <= 0) OF_panic("%s: cannot determine boot CPU clock", __func__); /* * Panic if there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL || end == 0 || kernel_tlb_slots == 0 || kernel_tlbs == NULL) OF_panic("%s: missing loader metadata.\nThis probably means " "you are not using loader(8).", __func__); /* * Work around the broken loader behavior of not demapping no * longer used kernel TLB slots when unloading the kernel or * modules. */ for (va = KERNBASE + (kernel_tlb_slots - 1) * PAGE_SIZE_4M; va >= roundup2(end, PAGE_SIZE_4M); va -= PAGE_SIZE_4M) { if (bootverbose) OF_printf("demapping unused kernel TLB slot " "(va %#lx - %#lx)\n", va, va + PAGE_SIZE_4M - 1); stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_DMMU_DEMAP, 0); stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_IMMU_DEMAP, 0); flush(KERNBASE); kernel_tlb_slots--; } /* * Determine the TLB slot maxima, which are expected to be * equal across all CPUs. * NB: for cheetah-class CPUs, these properties only refer * to the t16s. */ if (OF_getprop(pc->pc_node, "#dtlb-entries", &dtlb_slots, sizeof(dtlb_slots)) == -1) OF_panic("%s: cannot determine number of dTLB slots", __func__); if (OF_getprop(pc->pc_node, "#itlb-entries", &itlb_slots, sizeof(itlb_slots)) == -1) OF_panic("%s: cannot determine number of iTLB slots", __func__); /* * Initialize and enable the caches. Note that this may include * applying workarounds. */ cache_init(pc); cache_enable(cpu_impl); uma_set_align(pc->pc_cache.dc_linesize - 1); cpu_block_copy = bcopy; cpu_block_zero = bzero; getenv_int("machdep.use_vis", &cpu_use_vis); if (cpu_use_vis) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: case CPU_IMPL_ULTRASPARCIII: /* NB: we've disabled P$. */ case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: case CPU_IMPL_ULTRASPARCIIIip: cpu_block_copy = spitfire_block_copy; cpu_block_zero = spitfire_block_zero; break; case CPU_IMPL_SPARC64V: cpu_block_copy = zeus_block_copy; cpu_block_zero = zeus_block_zero; break; } } #ifdef SMP mp_init(); #endif /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(cpu_impl); /* * Initialize tunables. */ init_param2(physmem); env = kern_getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize the interrupt tables. */ intr_init1(); /* * Initialize proc0, set kstack0, frame0, curthread and curpcb. */ proc_linkup0(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; thread0.td_kstack = kstack0; thread0.td_kstack_pages = KSTACK_PAGES; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; /* * Initialize global registers. */ cpu_setregs(pc); /* * Take over the trap table via the PROM. Using the PROM for this * is necessary in order to set obp-control-relinquished to true * within the PROM so obtaining /virtual-memory/translations doesn't * trigger a fatal reset error or worse things further down the road. * XXX it should be possible to use this solely instead of writing * %tba in cpu_setregs(). Doing so causes a hang however. * * NB: the low-level console drivers require a working DELAY() and * some compiler optimizations may cause the curthread accesses of * mutex(9) to be factored out even if the latter aren't actually * called. Both of these require PCPU_REG to be set. However, we * can't set PCPU_REG without also taking over the trap table or the * firmware will overwrite it. */ sun4u_set_traptable(tl0_base); /* * Initialize the dynamic per-CPU area for the BSP and the message * buffer (after setting the trap table). */ dpcpu_init(dpcpu0, 0); msgbufinit(msgbufp, msgbufsize); /* * Initialize mutexes. */ mutex_init(); /* * Initialize console now that we have a reasonable set of system * services. */ cninit(); /* * Finish the interrupt initialization now that mutexes work and * enable them. */ intr_init2(); wrpr(pil, 0, 0); wrpr(pstate, 0, PSTATE_KERNEL); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; u_long sp; int oonstack; int sig; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal trampoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->_mc_tpc, mc->_mc_sp, mc->_mc_tstate); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; /* * Copy the registers which will be restored by tl0_ret() from the * trapframe. * Note that we skip %g7 which is used as the userland TLS register * and %wstate. */ mc->_mc_flags = _MC_VERSION; mc->mc_global[1] = tf->tf_global[1]; mc->mc_global[2] = tf->tf_global[2]; mc->mc_global[3] = tf->tf_global[3]; mc->mc_global[4] = tf->tf_global[4]; mc->mc_global[5] = tf->tf_global[5]; mc->mc_global[6] = tf->tf_global[6]; if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } else { mc->mc_out[0] = tf->tf_out[0]; mc->mc_out[1] = tf->tf_out[1]; } mc->mc_out[2] = tf->tf_out[2]; mc->mc_out[3] = tf->tf_out[3]; mc->mc_out[4] = tf->tf_out[4]; mc->mc_out[5] = tf->tf_out[5]; mc->mc_out[6] = tf->tf_out[6]; mc->mc_out[7] = tf->tf_out[7]; mc->_mc_fprs = tf->tf_fprs; mc->_mc_fsr = tf->tf_fsr; mc->_mc_gsr = tf->tf_gsr; mc->_mc_tnpc = tf->tf_tnpc; mc->_mc_tpc = tf->tf_tpc; mc->_mc_tstate = tf->tf_tstate; mc->_mc_y = tf->tf_y; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); tf->tf_fprs &= ~FPRS_FEF; pcb->pcb_flags |= PCB_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->_mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; if (!TSTATE_SECURE(mc->_mc_tstate) || (mc->_mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); /* * Copy the registers which will be restored by tl0_ret() to the * trapframe. * Note that we skip %g7 which is used as the userland TLS register * and %wstate. */ tf->tf_global[1] = mc->mc_global[1]; tf->tf_global[2] = mc->mc_global[2]; tf->tf_global[3] = mc->mc_global[3]; tf->tf_global[4] = mc->mc_global[4]; tf->tf_global[5] = mc->mc_global[5]; tf->tf_global[6] = mc->mc_global[6]; tf->tf_out[0] = mc->mc_out[0]; tf->tf_out[1] = mc->mc_out[1]; tf->tf_out[2] = mc->mc_out[2]; tf->tf_out[3] = mc->mc_out[3]; tf->tf_out[4] = mc->mc_out[4]; tf->tf_out[5] = mc->mc_out[5]; tf->tf_out[6] = mc->mc_out[6]; tf->tf_out[7] = mc->mc_out[7]; tf->tf_fprs = mc->_mc_fprs; tf->tf_fsr = mc->_mc_fsr; tf->tf_gsr = mc->_mc_gsr; tf->tf_tnpc = mc->_mc_tnpc; tf->tf_tpc = mc->_mc_tpc; tf->tf_tstate = mc->_mc_tstate; tf->tf_y = mc->_mc_y; if ((mc->_mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif ofw_exit(args); } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* TBD */ } /* Get current clock frequency for the given CPU ID. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { struct pcpu *pc; pc = pcpu_find(cpu_id); if (pc == NULL || rate == NULL) return (EINVAL); *rate = pc->pc_clock; return (0); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. */ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } static void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? */ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware. */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(int busy) { /* Insert code to halt (until next interrupt) for the idle loop. */ } int cpu_idle_wakeup(int cpu) { return (1); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; struct pcb *pcb; struct proc *p; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = imgp->entry_addr + 4; tf->tf_tpc = imgp->entry_addr; /* * While we could adhere to the memory model indicated in the ELF * header, it turns out that just always using TSO performs best. */ tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; - - td->td_retval[0] = tf->tf_out[0]; - td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } Index: projects/bsd_rdma_4_9/usr.bin/vmstat/vmstat.c =================================================================== --- projects/bsd_rdma_4_9/usr.bin/vmstat/vmstat.c (revision 326161) +++ projects/bsd_rdma_4_9/usr.bin/vmstat/vmstat.c (revision 326162) @@ -1,1738 +1,1738 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1986, 1991, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #if 0 #ifndef lint static char sccsid[] = "@(#)vmstat.c 8.1 (Berkeley) 6/6/93"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define _WANT_VMMETER #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VMSTAT_XO_VERSION "1" static char da[] = "da"; static struct nlist namelist[] = { #define X_SUM 0 { "_vm_cnt" }, #define X_HZ 1 { "_hz" }, #define X_STATHZ 2 { "_stathz" }, #define X_NCHSTATS 3 { "_nchstats" }, #define X_INTRNAMES 4 { "_intrnames" }, #define X_SINTRNAMES 5 { "_sintrnames" }, #define X_INTRCNT 6 { "_intrcnt" }, #define X_SINTRCNT 7 { "_sintrcnt" }, #ifdef notyet #define X_DEFICIT XXX { "_deficit" }, #define X_REC XXX { "_rectime" }, #define X_PGIN XXX { "_pgintime" }, #define X_XSTATS XXX { "_xstats" }, #define X_END XXX #else #define X_END 8 #endif { "" }, }; static struct statinfo cur, last; static int num_devices, maxshowdevs; static long generation; static struct device_selection *dev_select; static int num_selected; static struct devstat_match *matches; static int num_matches = 0; static int num_devices_specified, num_selections; static long select_generation; static char **specified_devices; static devstat_select_mode select_mode; static struct __vmmeter { uint64_t v_swtch; uint64_t v_trap; uint64_t v_syscall; uint64_t v_intr; uint64_t v_soft; uint64_t v_vm_faults; uint64_t v_io_faults; uint64_t v_cow_faults; uint64_t v_cow_optim; uint64_t v_zfod; uint64_t v_ozfod; uint64_t v_swapin; uint64_t v_swapout; uint64_t v_swappgsin; uint64_t v_swappgsout; uint64_t v_vnodein; uint64_t v_vnodeout; uint64_t v_vnodepgsin; uint64_t v_vnodepgsout; uint64_t v_intrans; uint64_t v_reactivated; uint64_t v_pdwakeups; uint64_t v_pdpages; uint64_t v_pdshortfalls; uint64_t v_dfree; uint64_t v_pfree; uint64_t v_tfree; uint64_t v_forks; uint64_t v_vforks; uint64_t v_rforks; uint64_t v_kthreads; uint64_t v_forkpages; uint64_t v_vforkpages; uint64_t v_rforkpages; uint64_t v_kthreadpages; u_int v_page_size; u_int v_page_count; u_int v_free_reserved; u_int v_free_target; u_int v_free_min; u_int v_free_count; u_int v_wire_count; u_int v_active_count; u_int v_inactive_target; u_int v_inactive_count; u_int v_laundry_count; u_int v_pageout_free_min; u_int v_interrupt_free_min; u_int v_free_severe; } sum, osum; #define VMSTAT_DEFAULT_LINES 20 /* Default number of `winlines'. */ volatile sig_atomic_t wresized; /* Tty resized, when non-zero. */ static int winlines = VMSTAT_DEFAULT_LINES; /* Current number of tty rows. */ static int aflag; static int nflag; static int Pflag; static int hflag; static kvm_t *kd; #define FORKSTAT 0x01 #define INTRSTAT 0x02 #define MEMSTAT 0x04 #define SUMSTAT 0x08 #define TIMESTAT 0x10 #define VMSTAT 0x20 #define ZMEMSTAT 0x40 #define OBJSTAT 0x80 static void cpustats(void); static void pcpustats(int, u_long, int); static void devstats(void); static void doforkst(void); static void dointr(unsigned int, int); static void doobjstat(void); static void dosum(void); static void dovmstat(unsigned int, int); static void domemstat_malloc(void); static void domemstat_zone(void); static void kread(int, void *, size_t); static void kreado(int, void *, size_t, size_t); static char *kgetstr(const char *); static void needhdr(int); static void needresize(int); static void doresize(void); static void printhdr(int, u_long); static void usage(void); static long pct(long, long); static long long getuptime(void); static char **getdrivedata(char **); int main(int argc, char *argv[]) { int c, todo; unsigned int interval; float f; int reps; char *memf, *nlistf; char errbuf[_POSIX2_LINE_MAX]; memf = nlistf = NULL; interval = reps = todo = 0; maxshowdevs = 2; hflag = isatty(1); argc = xo_parse_args(argc, argv); if (argc < 0) return argc; while ((c = getopt(argc, argv, "ac:fhHiM:mN:n:oPp:stw:z")) != -1) { switch (c) { case 'a': aflag++; break; case 'c': reps = atoi(optarg); break; case 'P': Pflag++; break; case 'f': todo |= FORKSTAT; break; case 'h': hflag = 1; break; case 'H': hflag = 0; break; case 'i': todo |= INTRSTAT; break; case 'M': memf = optarg; break; case 'm': todo |= MEMSTAT; break; case 'N': nlistf = optarg; break; case 'n': nflag = 1; maxshowdevs = atoi(optarg); if (maxshowdevs < 0) xo_errx(1, "number of devices %d is < 0", maxshowdevs); break; case 'o': todo |= OBJSTAT; break; case 'p': if (devstat_buildmatch(optarg, &matches, &num_matches) != 0) xo_errx(1, "%s", devstat_errbuf); break; case 's': todo |= SUMSTAT; break; case 't': #ifdef notyet todo |= TIMESTAT; #else xo_errx(EX_USAGE, "sorry, -t is not (re)implemented yet"); #endif break; case 'w': /* Convert to milliseconds. */ f = atof(optarg); interval = f * 1000; break; case 'z': todo |= ZMEMSTAT; break; case '?': default: usage(); } } argc -= optind; argv += optind; xo_set_version(VMSTAT_XO_VERSION); if (todo == 0) todo = VMSTAT; if (memf != NULL) { kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf); if (kd == NULL) xo_errx(1, "kvm_openfiles: %s", errbuf); } retry_nlist: if (kd != NULL && (c = kvm_nlist(kd, namelist)) != 0) { if (c > 0) { int bufsize = 0, len = 0; char *buf, *bp; /* * 'cnt' was renamed to 'vm_cnt'. If 'vm_cnt' is not * found try looking up older 'cnt' symbol. * */ if (namelist[X_SUM].n_type == 0 && strcmp(namelist[X_SUM].n_name, "_vm_cnt") == 0) { namelist[X_SUM].n_name = "_cnt"; goto retry_nlist; } for (c = 0; c < (int)(nitems(namelist)); c++) if (namelist[c].n_type == 0) bufsize += strlen(namelist[c].n_name) + 1; bufsize += len + 1; buf = bp = alloca(bufsize); for (c = 0; c < (int)(nitems(namelist)); c++) if (namelist[c].n_type == 0) { xo_error(" %s", namelist[c].n_name); len = strlen(namelist[c].n_name); *bp++ = ' '; memcpy(bp, namelist[c].n_name, len); bp += len; } *bp = '\0'; xo_error("undefined symbols:\n", buf); } else xo_warnx("kvm_nlist: %s", kvm_geterr(kd)); xo_finish(); exit(1); } if (kd && Pflag) xo_errx(1, "Cannot use -P with crash dumps"); if (todo & VMSTAT) { /* * Make sure that the userland devstat version matches the * kernel devstat version. If not, exit and print a * message informing the user of his mistake. */ if (devstat_checkversion(NULL) < 0) xo_errx(1, "%s", devstat_errbuf); argv = getdrivedata(argv); } if (*argv) { f = atof(*argv); interval = f * 1000; if (*++argv) reps = atoi(*argv); } if (interval) { if (!reps) reps = -1; } else if (reps) interval = 1 * 1000; if (todo & FORKSTAT) doforkst(); if (todo & MEMSTAT) domemstat_malloc(); if (todo & ZMEMSTAT) domemstat_zone(); if (todo & SUMSTAT) dosum(); if (todo & OBJSTAT) doobjstat(); #ifdef notyet if (todo & TIMESTAT) dotimes(); #endif if (todo & INTRSTAT) dointr(interval, reps); if (todo & VMSTAT) dovmstat(interval, reps); xo_finish(); exit(0); } static int mysysctl(const char *name, void *oldp, size_t *oldlenp) { int error; error = sysctlbyname(name, oldp, oldlenp, NULL, 0); if (error != 0 && errno != ENOMEM) xo_err(1, "sysctl(%s)", name); return (error); } static char ** getdrivedata(char **argv) { if ((num_devices = devstat_getnumdevs(NULL)) < 0) xo_errx(1, "%s", devstat_errbuf); cur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo)); last.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo)); if (devstat_getdevs(NULL, &cur) == -1) xo_errx(1, "%s", devstat_errbuf); num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; specified_devices = (char **)malloc(sizeof(char *)); for (num_devices_specified = 0; *argv; ++argv) { if (isdigit(**argv)) break; num_devices_specified++; specified_devices = (char **)realloc(specified_devices, sizeof(char *) * num_devices_specified); specified_devices[num_devices_specified - 1] = *argv; } dev_select = NULL; if (nflag == 0 && maxshowdevs < num_devices_specified) maxshowdevs = num_devices_specified; /* * People are generally only interested in disk statistics when * they're running vmstat. So, that's what we're going to give * them if they don't specify anything by default. We'll also give * them any other random devices in the system so that we get to * maxshowdevs devices, if that many devices exist. If the user * specifies devices on the command line, either through a pattern * match or by naming them explicitly, we will give the user only * those devices. */ if ((num_devices_specified == 0) && (num_matches == 0)) { if (devstat_buildmatch(da, &matches, &num_matches) != 0) xo_errx(1, "%s", devstat_errbuf); select_mode = DS_SELECT_ADD; } else select_mode = DS_SELECT_ONLY; /* * At this point, selectdevs will almost surely indicate that the * device list has changed, so we don't look for return values of 0 * or 1. If we get back -1, though, there is an error. */ if (devstat_selectdevs(&dev_select, &num_selected, &num_selections, &select_generation, generation, cur.dinfo->devices, num_devices, matches, num_matches, specified_devices, num_devices_specified, select_mode, maxshowdevs, 0) == -1) xo_errx(1, "%s", devstat_errbuf); return(argv); } /* Return system uptime in nanoseconds */ static long long getuptime(void) { struct timespec sp; (void)clock_gettime(CLOCK_UPTIME, &sp); return((long long)sp.tv_sec * 1000000000LL + sp.tv_nsec); } static void fill_vmmeter(struct __vmmeter *vmmp) { struct pcpu **pcpu; int maxcpu, i; if (kd != NULL) { struct vmmeter vm_cnt; kread(X_SUM, &vm_cnt, sizeof(vm_cnt)); #define GET_COUNTER(name) \ vmmp->name = kvm_counter_u64_fetch(kd, (u_long)vm_cnt.name) GET_COUNTER(v_swtch); GET_COUNTER(v_trap); GET_COUNTER(v_syscall); GET_COUNTER(v_intr); GET_COUNTER(v_soft); GET_COUNTER(v_vm_faults); GET_COUNTER(v_io_faults); GET_COUNTER(v_cow_faults); GET_COUNTER(v_cow_optim); GET_COUNTER(v_zfod); GET_COUNTER(v_ozfod); GET_COUNTER(v_swapin); GET_COUNTER(v_swapout); GET_COUNTER(v_swappgsin); GET_COUNTER(v_swappgsout); GET_COUNTER(v_vnodein); GET_COUNTER(v_vnodeout); GET_COUNTER(v_vnodepgsin); GET_COUNTER(v_vnodepgsout); GET_COUNTER(v_intrans); GET_COUNTER(v_tfree); GET_COUNTER(v_forks); GET_COUNTER(v_vforks); GET_COUNTER(v_rforks); GET_COUNTER(v_kthreads); GET_COUNTER(v_forkpages); GET_COUNTER(v_vforkpages); GET_COUNTER(v_rforkpages); GET_COUNTER(v_kthreadpages); #undef GET_COUNTER } else { size_t size; #define GET_VM_STATS(cat, name) do { \ size = sizeof(vmmp->name); \ mysysctl("vm.stats." #cat "." #name, &vmmp->name, &size); \ } while (0) /* sys */ GET_VM_STATS(sys, v_swtch); GET_VM_STATS(sys, v_trap); GET_VM_STATS(sys, v_syscall); GET_VM_STATS(sys, v_intr); GET_VM_STATS(sys, v_soft); /* vm */ GET_VM_STATS(vm, v_vm_faults); GET_VM_STATS(vm, v_io_faults); GET_VM_STATS(vm, v_cow_faults); GET_VM_STATS(vm, v_cow_optim); GET_VM_STATS(vm, v_zfod); GET_VM_STATS(vm, v_ozfod); GET_VM_STATS(vm, v_swapin); GET_VM_STATS(vm, v_swapout); GET_VM_STATS(vm, v_swappgsin); GET_VM_STATS(vm, v_swappgsout); GET_VM_STATS(vm, v_vnodein); GET_VM_STATS(vm, v_vnodeout); GET_VM_STATS(vm, v_vnodepgsin); GET_VM_STATS(vm, v_vnodepgsout); GET_VM_STATS(vm, v_intrans); GET_VM_STATS(vm, v_reactivated); GET_VM_STATS(vm, v_pdwakeups); GET_VM_STATS(vm, v_pdpages); GET_VM_STATS(vm, v_pdshortfalls); GET_VM_STATS(vm, v_dfree); GET_VM_STATS(vm, v_pfree); GET_VM_STATS(vm, v_tfree); GET_VM_STATS(vm, v_page_size); GET_VM_STATS(vm, v_page_count); GET_VM_STATS(vm, v_free_reserved); GET_VM_STATS(vm, v_free_target); GET_VM_STATS(vm, v_free_min); GET_VM_STATS(vm, v_free_count); GET_VM_STATS(vm, v_wire_count); GET_VM_STATS(vm, v_active_count); GET_VM_STATS(vm, v_inactive_target); GET_VM_STATS(vm, v_inactive_count); GET_VM_STATS(vm, v_laundry_count); GET_VM_STATS(vm, v_pageout_free_min); GET_VM_STATS(vm, v_interrupt_free_min); /*GET_VM_STATS(vm, v_free_severe);*/ GET_VM_STATS(vm, v_forks); GET_VM_STATS(vm, v_vforks); GET_VM_STATS(vm, v_rforks); GET_VM_STATS(vm, v_kthreads); GET_VM_STATS(vm, v_forkpages); GET_VM_STATS(vm, v_vforkpages); GET_VM_STATS(vm, v_rforkpages); GET_VM_STATS(vm, v_kthreadpages); #undef GET_VM_STATS } } static void fill_vmtotal(struct vmtotal *vmtp) { if (kd != NULL) { /* XXX fill vmtp */ xo_errx(1, "not implemented"); } else { size_t size = sizeof(*vmtp); mysysctl("vm.vmtotal", vmtp, &size); if (size != sizeof(*vmtp)) xo_errx(1, "vm.total size mismatch"); } } /* Determine how many cpu columns, and what index they are in kern.cp_times */ static int getcpuinfo(u_long *maskp, int *maxidp) { int maxcpu; int maxid; int ncpus; int i, j; int empty; size_t size; long *times; u_long mask; if (kd != NULL) xo_errx(1, "not implemented"); mask = 0; ncpus = 0; size = sizeof(maxcpu); mysysctl("kern.smp.maxcpus", &maxcpu, &size); if (size != sizeof(maxcpu)) xo_errx(1, "sysctl kern.smp.maxcpus"); size = sizeof(long) * maxcpu * CPUSTATES; times = malloc(size); if (times == NULL) xo_err(1, "malloc %zd bytes", size); mysysctl("kern.cp_times", times, &size); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { mask |= (1ul << i); ncpus++; } } if (maskp) *maskp = mask; if (maxidp) *maxidp = maxid; return (ncpus); } static void -prthuman(const char *name, u_int64_t val, int size) +prthuman(const char *name, uint64_t val, int size) { char buf[10]; int flags; char fmt[128]; snprintf(fmt, sizeof(fmt), "{:%s/%%*s}", name); if (size < 5 || size > 9) xo_errx(1, "doofus"); flags = HN_B | HN_NOSPACE | HN_DECIMAL; humanize_number(buf, size, val, "", HN_AUTOSCALE, flags); xo_attr("value", "%ju", (uintmax_t) val); xo_emit(fmt, size, buf); } static int hz, hdrcnt; static long *cur_cp_times; static long *last_cp_times; static size_t size_cp_times; static void dovmstat(unsigned int interval, int reps) { struct vmtotal total; time_t uptime, halfuptime; struct devinfo *tmp_dinfo; size_t size; int ncpus, maxid; u_long cpumask; int rate_adj; uptime = getuptime() / 1000000000LL; halfuptime = uptime / 2; rate_adj = 1; ncpus = 1; maxid = 0; /* * If the user stops the program (control-Z) and then resumes it, * print out the header again. */ (void)signal(SIGCONT, needhdr); /* * If our standard output is a tty, then install a SIGWINCH handler * and set wresized so that our first iteration through the main * vmstat loop will peek at the terminal's current rows to find out * how many lines can fit in a screenful of output. */ if (isatty(fileno(stdout)) != 0) { wresized = 1; (void)signal(SIGWINCH, needresize); } else { wresized = 0; winlines = VMSTAT_DEFAULT_LINES; } if (kd != NULL) { if (namelist[X_STATHZ].n_type != 0 && namelist[X_STATHZ].n_value != 0) kread(X_STATHZ, &hz, sizeof(hz)); if (!hz) kread(X_HZ, &hz, sizeof(hz)); } else { struct clockinfo clockrate; size = sizeof(clockrate); mysysctl("kern.clockrate", &clockrate, &size); if (size != sizeof(clockrate)) xo_errx(1, "clockrate size mismatch"); hz = clockrate.hz; } if (Pflag) { ncpus = getcpuinfo(&cpumask, &maxid); size_cp_times = sizeof(long) * (maxid + 1) * CPUSTATES; cur_cp_times = calloc(1, size_cp_times); last_cp_times = calloc(1, size_cp_times); } for (hdrcnt = 1;;) { if (!--hdrcnt) printhdr(maxid, cpumask); if (kd != NULL) { if (kvm_getcptime(kd, cur.cp_time) < 0) xo_errx(1, "kvm_getcptime: %s", kvm_geterr(kd)); } else { size = sizeof(cur.cp_time); mysysctl("kern.cp_time", &cur.cp_time, &size); if (size != sizeof(cur.cp_time)) xo_errx(1, "cp_time size mismatch"); } if (Pflag) { size = size_cp_times; mysysctl("kern.cp_times", cur_cp_times, &size); if (size != size_cp_times) xo_errx(1, "cp_times mismatch"); } tmp_dinfo = last.dinfo; last.dinfo = cur.dinfo; cur.dinfo = tmp_dinfo; last.snap_time = cur.snap_time; /* * Here what we want to do is refresh our device stats. * getdevs() returns 1 when the device list has changed. * If the device list has changed, we want to go through * the selection process again, in case a device that we * were previously displaying has gone away. */ switch (devstat_getdevs(NULL, &cur)) { case -1: xo_errx(1, "%s", devstat_errbuf); break; case 1: { int retval; num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; retval = devstat_selectdevs(&dev_select, &num_selected, &num_selections, &select_generation, generation, cur.dinfo->devices, num_devices, matches, num_matches, specified_devices, num_devices_specified, select_mode, maxshowdevs, 0); switch (retval) { case -1: xo_errx(1, "%s", devstat_errbuf); break; case 1: printhdr(maxid, cpumask); break; default: break; } } default: break; } fill_vmmeter(&sum); fill_vmtotal(&total); xo_open_container("processes"); xo_emit("{:runnable/%1d} {:waiting/%ld} " "{:swapped-out/%ld}", total.t_rq - 1, total.t_dw + total.t_pw, total.t_sw); xo_close_container("processes"); xo_open_container("memory"); -#define vmstat_pgtok(a) ((a) * (sum.v_page_size >> 10)) +#define vmstat_pgtok(a) ((uintmax_t)(a) * (sum.v_page_size >> 10)) #define rate(x) (((x) * rate_adj + halfuptime) / uptime) /* round */ if (hflag) { xo_emit(""); prthuman("available-memory", - total.t_avm * (u_int64_t)sum.v_page_size, 5); + total.t_avm * (uint64_t)sum.v_page_size, 5); xo_emit(" "); prthuman("free-memory", - total.t_free * (u_int64_t)sum.v_page_size, 5); + total.t_free * (uint64_t)sum.v_page_size, 5); xo_emit(" "); } else { xo_emit(" "); - xo_emit("{:available-memory/%7d}", + xo_emit("{:available-memory/%7ju}", vmstat_pgtok(total.t_avm)); xo_emit(" "); - xo_emit("{:free-memory/%7d}", + xo_emit("{:free-memory/%7ju}", vmstat_pgtok(total.t_free)); xo_emit(" "); } xo_emit("{:total-page-faults/%5lu} ", (unsigned long)rate(sum.v_vm_faults - osum.v_vm_faults)); xo_close_container("memory"); xo_open_container("paging-rates"); xo_emit("{:page-reactivated/%3lu} ", (unsigned long)rate(sum.v_reactivated - osum.v_reactivated)); xo_emit("{:paged-in/%3lu} ", (unsigned long)rate(sum.v_swapin + sum.v_vnodein - (osum.v_swapin + osum.v_vnodein))); xo_emit("{:paged-out/%3lu} ", (unsigned long)rate(sum.v_swapout + sum.v_vnodeout - (osum.v_swapout + osum.v_vnodeout))); xo_emit("{:freed/%5lu} ", (unsigned long)rate(sum.v_tfree - osum.v_tfree)); xo_emit("{:scanned/%4lu} ", (unsigned long)rate(sum.v_pdpages - osum.v_pdpages)); xo_close_container("paging-rates"); devstats(); xo_open_container("fault-rates"); xo_emit("{:interrupts/%4lu} {:system-calls/%5lu} " "{:context-switches/%5u}", (unsigned long)rate(sum.v_intr - osum.v_intr), (unsigned long)rate(sum.v_syscall - osum.v_syscall), (unsigned long)rate(sum.v_swtch - osum.v_swtch)); xo_close_container("fault-rates"); if (Pflag) pcpustats(ncpus, cpumask, maxid); else cpustats(); xo_emit("\n"); xo_flush(); if (reps >= 0 && --reps <= 0) break; osum = sum; uptime = interval; rate_adj = 1000; /* * We round upward to avoid losing low-frequency events * (i.e., >= 1 per interval but < 1 per millisecond). */ if (interval != 1) halfuptime = (uptime + 1) / 2; else halfuptime = 0; (void)usleep(interval * 1000); } } static void printhdr(int maxid, u_long cpumask) { int i, num_shown; num_shown = MIN(num_selected, maxshowdevs); if (hflag) { xo_emit("{T:procs} {T:memory} {T:/page%*s}", 19, ""); } else { xo_emit("{T:procs} {T:memory} {T:/page%*s}", 19, ""); } if (num_shown > 1) xo_emit(" {T:/disks %*s}", num_shown * 4 - 7, ""); else if (num_shown == 1) xo_emit(" {T:disks}"); xo_emit(" {T:faults} "); if (Pflag) { for (i = 0; i <= maxid; i++) { if (cpumask & (1ul << i)) xo_emit(" {T:/cpu%d} ", i); } xo_emit("\n"); } else xo_emit(" {T:cpu}\n"); if (hflag) { xo_emit("{T:r} {T:b} {T:w} {T:avm} {T:fre} {T:flt} {T:re} {T:pi} {T:po} {T:fr} {T:sr} "); } else { xo_emit("{T:r} {T:b} {T:w} {T:avm} {T:fre} {T:flt} {T:re} {T:pi} {T:po} {T:fr} {T:sr} "); } for (i = 0; i < num_devices; i++) if ((dev_select[i].selected) && (dev_select[i].selected <= maxshowdevs)) xo_emit("{T:/%c%c%d} ", dev_select[i].device_name[0], dev_select[i].device_name[1], dev_select[i].unit_number); xo_emit(" {T:in} {T:sy} {T:cs}"); if (Pflag) { for (i = 0; i <= maxid; i++) { if (cpumask & (1ul << i)) xo_emit(" {T:us} {T:sy} {T:id}"); } xo_emit("\n"); } else xo_emit(" {T:us} {T:sy} {T:id}\n"); if (wresized != 0) doresize(); hdrcnt = winlines; } /* * Force a header to be prepended to the next output. */ static void needhdr(int dummy __unused) { hdrcnt = 1; } /* * When the terminal is resized, force an update of the maximum number of rows * printed between each header repetition. Then force a new header to be * prepended to the next output. */ void needresize(int signo) { wresized = 1; hdrcnt = 1; } /* * Update the global `winlines' count of terminal rows. */ void doresize(void) { int status; struct winsize w; for (;;) { status = ioctl(fileno(stdout), TIOCGWINSZ, &w); if (status == -1 && errno == EINTR) continue; else if (status == -1) xo_err(1, "ioctl"); if (w.ws_row > 3) winlines = w.ws_row - 3; else winlines = VMSTAT_DEFAULT_LINES; break; } /* * Inhibit doresize() calls until we are rescheduled by SIGWINCH. */ wresized = 0; } #ifdef notyet static void dotimes(void) { unsigned int pgintime, rectime; kread(X_REC, &rectime, sizeof(rectime)); kread(X_PGIN, &pgintime, sizeof(pgintime)); kread(X_SUM, &sum, sizeof(sum)); xo_emit("{:page-reclaims/%u} {N:reclaims}, " "{:reclaim-time/%u} {N:total time (usec)}\n", sum.v_pgrec, rectime); xo_emit("{L:average}: {:reclaim-average/%u} {N:usec \\/ reclaim}\n", rectime / sum.v_pgrec); xo_emit("\n"); xo_emit("{:page-ins/%u} {N:page ins}, " "{:page-in-time/%u} {N:total time (msec)}\n", sum.v_pgin, pgintime / 10); xo_emit("{L:average}: {:average/%8.1f} {N:msec \\/ page in}\n", pgintime / (sum.v_pgin * 10.0)); } #endif static long pct(long top, long bot) { long ans; if (bot == 0) return(0); ans = (quad_t)top * 100 / bot; return (ans); } #define PCT(top, bot) pct((long)(top), (long)(bot)) static void dosum(void) { struct nchstats lnchstats; long nchtotal; fill_vmmeter(&sum); xo_open_container("summary-statistics"); xo_emit("{:context-switches/%9u} {N:cpu context switches}\n", sum.v_swtch); xo_emit("{:interrupts/%9u} {N:device interrupts}\n", sum.v_intr); xo_emit("{:software-interrupts/%9u} {N:software interrupts}\n", sum.v_soft); xo_emit("{:traps/%9u} {N:traps}\n", sum.v_trap); xo_emit("{:system-calls/%9u} {N:system calls}\n", sum.v_syscall); xo_emit("{:kernel-threads/%9u} {N:kernel threads created}\n", sum.v_kthreads); xo_emit("{:forks/%9u} {N: fork() calls}\n", sum.v_forks); xo_emit("{:vforks/%9u} {N:vfork() calls}\n", sum.v_vforks); xo_emit("{:rforks/%9u} {N:rfork() calls}\n", sum.v_rforks); xo_emit("{:swap-ins/%9u} {N:swap pager pageins}\n", sum.v_swapin); xo_emit("{:swap-in-pages/%9u} {N:swap pager pages paged in}\n", sum.v_swappgsin); xo_emit("{:swap-outs/%9u} {N:swap pager pageouts}\n", sum.v_swapout); xo_emit("{:swap-out-pages/%9u} {N:swap pager pages paged out}\n", sum.v_swappgsout); xo_emit("{:vnode-page-ins/%9u} {N:vnode pager pageins}\n", sum.v_vnodein); xo_emit("{:vnode-page-in-pages/%9u} {N:vnode pager pages paged in}\n", sum.v_vnodepgsin); xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pageouts}\n", sum.v_vnodeout); xo_emit("{:vnode-page-out-pages/%9u} {N:vnode pager pages paged out}\n", sum.v_vnodepgsout); xo_emit("{:page-daemon-wakeups/%9u} {N:page daemon wakeups}\n", sum.v_pdwakeups); xo_emit("{:page-daemon-pages/%9u} {N:pages examined by the page daemon}\n", sum.v_pdpages); xo_emit("{:page-reclamation-shortfalls/%9u} {N:clean page reclamation shortfalls}\n", sum.v_pdshortfalls); xo_emit("{:reactivated/%9u} {N:pages reactivated by the page daemon}\n", sum.v_reactivated); xo_emit("{:copy-on-write-faults/%9u} {N:copy-on-write faults}\n", sum.v_cow_faults); xo_emit("{:copy-on-write-optimized-faults/%9u} {N:copy-on-write optimized faults}\n", sum.v_cow_optim); xo_emit("{:zero-fill-pages/%9u} {N:zero fill pages zeroed}\n", sum.v_zfod); xo_emit("{:zero-fill-prezeroed/%9u} {N:zero fill pages prezeroed}\n", sum.v_ozfod); xo_emit("{:intransit-blocking/%9u} {N:intransit blocking page faults}\n", sum.v_intrans); xo_emit("{:total-faults/%9u} {N:total VM faults taken}\n", sum.v_vm_faults); xo_emit("{:faults-requiring-io/%9u} {N:page faults requiring I\\/O}\n", sum.v_io_faults); xo_emit("{:faults-from-thread-creation/%9u} {N:pages affected by kernel thread creation}\n", sum.v_kthreadpages); xo_emit("{:faults-from-fork/%9u} {N:pages affected by fork}()\n", sum.v_forkpages); xo_emit("{:faults-from-vfork/%9u} {N:pages affected by vfork}()\n", sum.v_vforkpages); xo_emit("{:pages-rfork/%9u} {N:pages affected by rfork}()\n", sum.v_rforkpages); xo_emit("{:pages-freed/%9u} {N:pages freed}\n", sum.v_tfree); xo_emit("{:pages-freed-by-daemon/%9u} {N:pages freed by daemon}\n", sum.v_dfree); xo_emit("{:pages-freed-on-exit/%9u} {N:pages freed by exiting processes}\n", sum.v_pfree); xo_emit("{:active-pages/%9u} {N:pages active}\n", sum.v_active_count); xo_emit("{:inactive-pages/%9u} {N:pages inactive}\n", sum.v_inactive_count); xo_emit("{:laundry-pages/%9u} {N:pages in the laundry queue}\n", sum.v_laundry_count); xo_emit("{:wired-pages/%9u} {N:pages wired down}\n", sum.v_wire_count); xo_emit("{:free-pages/%9u} {N:pages free}\n", sum.v_free_count); xo_emit("{:bytes-per-page/%9u} {N:bytes per page}\n", sum.v_page_size); if (kd != NULL) { kread(X_NCHSTATS, &lnchstats, sizeof(lnchstats)); } else { size_t size = sizeof(lnchstats); mysysctl("vfs.cache.nchstats", &lnchstats, &size); if (size != sizeof(lnchstats)) xo_errx(1, "vfs.cache.nchstats size mismatch"); } nchtotal = lnchstats.ncs_goodhits + lnchstats.ncs_neghits + lnchstats.ncs_badhits + lnchstats.ncs_falsehits + lnchstats.ncs_miss + lnchstats.ncs_long; xo_emit("{:total-name-lookups/%9ld} {N:total name lookups}\n", nchtotal); xo_emit("{P:/%9s} {N:cache hits} " "({:positive-cache-hits/%ld}% pos + " "{:negative-cache-hits/%ld}% {N:neg}) " "system {:cache-hit-percent/%ld}% per-directory\n", "", PCT(lnchstats.ncs_goodhits, nchtotal), PCT(lnchstats.ncs_neghits, nchtotal), PCT(lnchstats.ncs_pass2, nchtotal)); xo_emit("{P:/%9s} {L:deletions} {:deletions/%ld}%, " "{L:falsehits} {:false-hits/%ld}%, " "{L:toolong} {:too-long/%ld}%\n", "", PCT(lnchstats.ncs_badhits, nchtotal), PCT(lnchstats.ncs_falsehits, nchtotal), PCT(lnchstats.ncs_long, nchtotal)); xo_close_container("summary-statistics"); } static void doforkst(void) { fill_vmmeter(&sum); xo_open_container("fork-statistics"); xo_emit("{:fork/%u} {N:forks}, {:fork-pages/%u} {N:pages}, " "{L:average} {:fork-average/%.2f}\n", sum.v_forks, sum.v_forkpages, sum.v_forks == 0 ? 0.0 : (double)sum.v_forkpages / sum.v_forks); xo_emit("{:vfork/%u} {N:vforks}, {:vfork-pages/%u} {N:pages}, " "{L:average} {:vfork-average/%.2f}\n", sum.v_vforks, sum.v_vforkpages, sum.v_vforks == 0 ? 0.0 : (double)sum.v_vforkpages / sum.v_vforks); xo_emit("{:rfork/%u} {N:rforks}, {:rfork-pages/%u} {N:pages}, " "{L:average} {:rfork-average/%.2f}\n", sum.v_rforks, sum.v_rforkpages, sum.v_rforks == 0 ? 0.0 : (double)sum.v_rforkpages / sum.v_rforks); xo_close_container("fork-statistics"); } static void devstats(void) { int dn, state; long double transfers_per_second; long double busy_seconds; long tmp; for (state = 0; state < CPUSTATES; ++state) { tmp = cur.cp_time[state]; cur.cp_time[state] -= last.cp_time[state]; last.cp_time[state] = tmp; } busy_seconds = cur.snap_time - last.snap_time; xo_open_list("device"); for (dn = 0; dn < num_devices; dn++) { int di; if ((dev_select[dn].selected == 0) || (dev_select[dn].selected > maxshowdevs)) continue; di = dev_select[dn].position; if (devstat_compute_statistics(&cur.dinfo->devices[di], &last.dinfo->devices[di], busy_seconds, DSM_TRANSFERS_PER_SECOND, &transfers_per_second, DSM_NONE) != 0) xo_errx(1, "%s", devstat_errbuf); xo_open_instance("device"); xo_emit("{ekq:name/%c%c%d}{:transfers/%3.0Lf} ", dev_select[dn].device_name[0], dev_select[dn].device_name[1], dev_select[dn].unit_number, transfers_per_second); xo_close_instance("device"); } xo_close_list("device"); } static void percent(const char *name, double pct, int *over) { char buf[10]; char fmt[128]; int l; snprintf(fmt, sizeof(fmt), " {:%s/%%*s}", name); l = snprintf(buf, sizeof(buf), "%.0f", pct); if (l == 1 && *over) { xo_emit(fmt, 1, buf); (*over)--; } else xo_emit(fmt, 2, buf); if (l > 2) (*over)++; } static void cpustats(void) { int state, over; double lpct, total; total = 0; for (state = 0; state < CPUSTATES; ++state) total += cur.cp_time[state]; if (total) lpct = 100.0 / total; else lpct = 0.0; over = 0; xo_open_container("cpu-statistics"); percent("user", (cur.cp_time[CP_USER] + cur.cp_time[CP_NICE]) * lpct, &over); percent("system", (cur.cp_time[CP_SYS] + cur.cp_time[CP_INTR]) * lpct, &over); percent("idle", cur.cp_time[CP_IDLE] * lpct, &over); xo_close_container("cpu-statistics"); } static void pcpustats(int ncpus, u_long cpumask, int maxid) { int state, i; double lpct, total; long tmp; int over; /* devstats does this for cp_time */ for (i = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; for (state = 0; state < CPUSTATES; ++state) { tmp = cur_cp_times[i * CPUSTATES + state]; cur_cp_times[i * CPUSTATES + state] -= last_cp_times[i * CPUSTATES + state]; last_cp_times[i * CPUSTATES + state] = tmp; } } over = 0; xo_open_list("cpu"); for (i = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; xo_open_instance("cpu"); xo_emit("{ke:name/%d}", i); total = 0; for (state = 0; state < CPUSTATES; ++state) total += cur_cp_times[i * CPUSTATES + state]; if (total) lpct = 100.0 / total; else lpct = 0.0; percent("user", (cur_cp_times[i * CPUSTATES + CP_USER] + cur_cp_times[i * CPUSTATES + CP_NICE]) * lpct, &over); percent("system", (cur_cp_times[i * CPUSTATES + CP_SYS] + cur_cp_times[i * CPUSTATES + CP_INTR]) * lpct, &over); percent("idle", cur_cp_times[i * CPUSTATES + CP_IDLE] * lpct, &over); xo_close_instance("cpu"); } xo_close_list("cpu"); } static unsigned int read_intrcnts(unsigned long **intrcnts) { size_t intrcntlen; if (kd != NULL) { kread(X_SINTRCNT, &intrcntlen, sizeof(intrcntlen)); if ((*intrcnts = malloc(intrcntlen)) == NULL) err(1, "malloc()"); kread(X_INTRCNT, *intrcnts, intrcntlen); } else { for (*intrcnts = NULL, intrcntlen = 1024; ; intrcntlen *= 2) { *intrcnts = reallocf(*intrcnts, intrcntlen); if (*intrcnts == NULL) err(1, "reallocf()"); if (mysysctl("hw.intrcnt", *intrcnts, &intrcntlen) == 0) break; } } return (intrcntlen / sizeof(unsigned long)); } static void print_intrcnts(unsigned long *intrcnts, unsigned long *old_intrcnts, char *intrnames, unsigned int nintr, size_t istrnamlen, long long period_ms) { unsigned long *intrcnt, *old_intrcnt; uint64_t inttotal, old_inttotal, total_count, total_rate; char* intrname; unsigned int i; inttotal = 0; old_inttotal = 0; intrname = intrnames; xo_open_list("interrupt"); for (i = 0, intrcnt=intrcnts, old_intrcnt=old_intrcnts; i < nintr; i++) { if (intrname[0] != '\0' && (*intrcnt != 0 || aflag)) { unsigned long count, rate; count = *intrcnt - *old_intrcnt; rate = (count * 1000 + period_ms / 2) / period_ms; xo_open_instance("interrupt"); xo_emit("{d:name/%-*s}{ket:name/%s} " "{:total/%20lu} {:rate/%10lu}\n", (int)istrnamlen, intrname, intrname, count, rate); xo_close_instance("interrupt"); } intrname += strlen(intrname) + 1; inttotal += *intrcnt++; old_inttotal += *old_intrcnt++; } total_count = inttotal - old_inttotal; total_rate = (total_count * 1000 + period_ms / 2) / period_ms; xo_close_list("interrupt"); xo_emit("{L:/%-*s} {:total-interrupts/%20" PRIu64 "} " "{:total-rate/%10" PRIu64 "}\n", (int)istrnamlen, "Total", total_count, total_rate); } static void dointr(unsigned int interval, int reps) { unsigned long *intrcnts; long long uptime, period_ms; unsigned long *old_intrcnts = NULL; size_t clen, inamlen, istrnamlen; char *intrnames, *intrname; uptime = getuptime(); /* Get the names of each interrupt source */ if (kd != NULL) { kread(X_SINTRNAMES, &inamlen, sizeof(inamlen)); if ((intrnames = malloc(inamlen)) == NULL) xo_err(1, "malloc()"); kread(X_INTRNAMES, intrnames, inamlen); } else { for (intrnames = NULL, inamlen = 1024; ; inamlen *= 2) { if ((intrnames = reallocf(intrnames, inamlen)) == NULL) xo_err(1, "reallocf()"); if (mysysctl("hw.intrnames", intrnames, &inamlen) == 0) break; } } /* Determine the length of the longest interrupt name */ intrname = intrnames; istrnamlen = strlen("interrupt"); while(*intrname != '\0') { clen = strlen(intrname); if (clen > istrnamlen) istrnamlen = clen; intrname += strlen(intrname) + 1; } xo_emit("{T:/%-*s} {T:/%20s} {T:/%10s}\n", (int)istrnamlen, "interrupt", "total", "rate"); /* * Loop reps times printing differential interrupt counts. If reps is * zero, then run just once, printing total counts */ xo_open_container("interrupt-statistics"); period_ms = uptime / 1000000; while(1) { unsigned int nintr; long long old_uptime; nintr = read_intrcnts(&intrcnts); /* * Initialize old_intrcnts to 0 for the first pass, so * print_intrcnts will print total interrupts since boot */ if (old_intrcnts == NULL) { old_intrcnts = calloc(nintr, sizeof(unsigned long)); if (old_intrcnts == NULL) xo_err(1, "calloc()"); } print_intrcnts(intrcnts, old_intrcnts, intrnames, nintr, istrnamlen, period_ms); xo_flush(); free(old_intrcnts); old_intrcnts = intrcnts; if (reps >= 0 && --reps <= 0) break; usleep(interval * 1000); old_uptime = uptime; uptime = getuptime(); period_ms = (uptime - old_uptime) / 1000000; } xo_close_container("interrupt-statistics"); } static void domemstat_malloc(void) { struct memory_type_list *mtlp; struct memory_type *mtp; int error, first, i; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } if (kd == NULL) { if (memstat_sysctl_malloc(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_malloc: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); return; } } else { if (memstat_kvm_malloc(mtlp, kd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_malloc: %s", kvm_geterr(kd)); else xo_warnx("memstat_kvm_malloc: %s", memstat_strerror(error)); } } xo_open_container("malloc-statistics"); xo_emit("{T:/%13s} {T:/%5s} {T:/%6s} {T:/%7s} {T:/%8s} {T:Size(s)}\n", "Type", "InUse", "MemUse", "HighUse", "Requests"); xo_open_list("memory"); for (mtp = memstat_mtl_first(mtlp); mtp != NULL; mtp = memstat_mtl_next(mtp)) { if (memstat_get_numallocs(mtp) == 0 && memstat_get_count(mtp) == 0) continue; xo_open_instance("memory"); xo_emit("{k:type/%13s/%s} {:in-use/%5" PRIu64 "} " "{:memory-use/%5" PRIu64 "}{U:K} {:high-use/%7s} " "{:requests/%8" PRIu64 "} ", memstat_get_name(mtp), memstat_get_count(mtp), (memstat_get_bytes(mtp) + 1023) / 1024, "-", memstat_get_numallocs(mtp)); first = 1; xo_open_list("size"); for (i = 0; i < 32; i++) { if (memstat_get_sizemask(mtp) & (1 << i)) { if (!first) xo_emit(","); xo_emit("{l:size/%d}", 1 << (i + 4)); first = 0; } } xo_close_list("size"); xo_close_instance("memory"); xo_emit("\n"); } xo_close_list("memory"); xo_close_container("malloc-statistics"); memstat_mtl_free(mtlp); } static void domemstat_zone(void) { struct memory_type_list *mtlp; struct memory_type *mtp; char name[MEMTYPE_MAXNAME + 1]; int error; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } if (kd == NULL) { if (memstat_sysctl_uma(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_uma: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); return; } } else { if (memstat_kvm_uma(mtlp, kd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_uma: %s", kvm_geterr(kd)); else xo_warnx("memstat_kvm_uma: %s", memstat_strerror(error)); } } xo_open_container("memory-zone-statistics"); xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} " "{T:/%4s} {T:/%4s}\n\n", "ITEM", "SIZE", "LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP"); xo_open_list("zone"); for (mtp = memstat_mtl_first(mtlp); mtp != NULL; mtp = memstat_mtl_next(mtp)) { strlcpy(name, memstat_get_name(mtp), MEMTYPE_MAXNAME); strcat(name, ":"); xo_open_instance("zone"); xo_emit("{d:name/%-20s}{ke:name/%s} {:size/%6" PRIu64 "}, " "{:limit/%6" PRIu64 "},{:used/%8" PRIu64 "}," "{:free/%8" PRIu64 "},{:requests/%8" PRIu64 "}," "{:fail/%4" PRIu64 "},{:sleep/%4" PRIu64 "}\n", name, memstat_get_name(mtp), memstat_get_size(mtp), memstat_get_countlimit(mtp), memstat_get_count(mtp), memstat_get_free(mtp), memstat_get_numallocs(mtp), memstat_get_failures(mtp), memstat_get_sleeps(mtp)); xo_close_instance("zone"); } memstat_mtl_free(mtlp); xo_close_list("zone"); xo_close_container("memory-zone-statistics"); xo_emit("\n"); } static void display_object(struct kinfo_vmobject *kvo) { const char *str; xo_open_instance("object"); - xo_emit("{:resident/%5jd} ", (uintmax_t)kvo->kvo_resident); - xo_emit("{:active/%5jd} ", (uintmax_t)kvo->kvo_active); - xo_emit("{:inactive/%5jd} ", (uintmax_t)kvo->kvo_inactive); + xo_emit("{:resident/%5ju} ", (uintmax_t)kvo->kvo_resident); + xo_emit("{:active/%5ju} ", (uintmax_t)kvo->kvo_active); + xo_emit("{:inactive/%5ju} ", (uintmax_t)kvo->kvo_inactive); xo_emit("{:refcount/%3d} ", kvo->kvo_ref_count); xo_emit("{:shadowcount/%3d} ", kvo->kvo_shadow_count); switch (kvo->kvo_memattr) { #ifdef VM_MEMATTR_UNCACHEABLE case VM_MEMATTR_UNCACHEABLE: str = "UC"; break; #endif #ifdef VM_MEMATTR_WRITE_COMBINING case VM_MEMATTR_WRITE_COMBINING: str = "WC"; break; #endif #ifdef VM_MEMATTR_WRITE_THROUGH case VM_MEMATTR_WRITE_THROUGH: str = "WT"; break; #endif #ifdef VM_MEMATTR_WRITE_PROTECTED case VM_MEMATTR_WRITE_PROTECTED: str = "WP"; break; #endif #ifdef VM_MEMATTR_WRITE_BACK case VM_MEMATTR_WRITE_BACK: str = "WB"; break; #endif #ifdef VM_MEMATTR_WEAK_UNCACHEABLE case VM_MEMATTR_WEAK_UNCACHEABLE: str = "UC-"; break; #endif #ifdef VM_MEMATTR_WB_WA case VM_MEMATTR_WB_WA: str = "WB"; break; #endif #ifdef VM_MEMATTR_NOCACHE case VM_MEMATTR_NOCACHE: str = "NC"; break; #endif #ifdef VM_MEMATTR_DEVICE case VM_MEMATTR_DEVICE: str = "DEV"; break; #endif #ifdef VM_MEMATTR_CACHEABLE case VM_MEMATTR_CACHEABLE: str = "C"; break; #endif #ifdef VM_MEMATTR_PREFETCHABLE case VM_MEMATTR_PREFETCHABLE: str = "PRE"; break; #endif default: str = "??"; break; } xo_emit("{:attribute/%-3s} ", str); switch (kvo->kvo_type) { case KVME_TYPE_NONE: str = "--"; break; case KVME_TYPE_DEFAULT: str = "df"; break; case KVME_TYPE_VNODE: str = "vn"; break; case KVME_TYPE_SWAP: str = "sw"; break; case KVME_TYPE_DEVICE: str = "dv"; break; case KVME_TYPE_PHYS: str = "ph"; break; case KVME_TYPE_DEAD: str = "dd"; break; case KVME_TYPE_SG: str = "sg"; break; case KVME_TYPE_MGTDEVICE: str = "md"; break; case KVME_TYPE_UNKNOWN: default: str = "??"; break; } xo_emit("{:type/%-2s} ", str); xo_emit("{:path/%-s}\n", kvo->kvo_path); xo_close_instance("object"); } static void doobjstat(void) { struct kinfo_vmobject *kvo; int cnt, i; kvo = kinfo_getvmobject(&cnt); if (kvo == NULL) { xo_warn("Failed to fetch VM object list"); return; } xo_emit("{T:RES/%5s} {T:ACT/%5s} {T:INACT/%5s} {T:REF/%3s} {T:SHD/%3s} " "{T:CM/%3s} {T:TP/%2s} {T:PATH/%s}\n"); xo_open_list("object"); for (i = 0; i < cnt; i++) display_object(&kvo[i]); free(kvo); xo_close_list("object"); } /* * kread reads something from the kernel, given its nlist index. */ static void kreado(int nlx, void *addr, size_t size, size_t offset) { const char *sym; if (namelist[nlx].n_type == 0 || namelist[nlx].n_value == 0) { sym = namelist[nlx].n_name; if (*sym == '_') ++sym; xo_errx(1, "symbol %s not defined", sym); } if ((size_t)kvm_read(kd, namelist[nlx].n_value + offset, addr, size) != size) { sym = namelist[nlx].n_name; if (*sym == '_') ++sym; xo_errx(1, "%s: %s", sym, kvm_geterr(kd)); } } static void kread(int nlx, void *addr, size_t size) { kreado(nlx, addr, size, 0); } static char * kgetstr(const char *strp) { int n = 0, size = 1; char *ret = NULL; do { if (size == n + 1) { ret = realloc(ret, size); if (ret == NULL) xo_err(1, "%s: realloc", __func__); size *= 2; } if (kvm_read(kd, (u_long)strp + n, &ret[n], 1) != 1) xo_errx(1, "%s: %s", __func__, kvm_geterr(kd)); } while (ret[n++] != '\0'); return (ret); } static void usage(void) { xo_error("%s%s", "usage: vmstat [-afHhimoPsz] [-M core [-N system]] [-c count] [-n devs]\n", " [-p type,if,pass] [-w wait] [disks] [wait [count]]\n"); xo_finish(); exit(1); } Index: projects/bsd_rdma_4_9 =================================================================== --- projects/bsd_rdma_4_9 (revision 326161) +++ projects/bsd_rdma_4_9 (revision 326162) Property changes on: projects/bsd_rdma_4_9 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r326132-326161