Index: projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb/zdb.c	(revision 326161)
+++ projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb/zdb.c	(revision 326162)
@@ -1,4158 +1,4161 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/blkptr.h>
 #include <zfs_comutil.h>
 #include <libcmdutils.h>
 #undef verify
 #include <libzfs.h>
 
 /*
  * Name/type lookup helpers that tolerate out-of-range indices.
  *
  * ZDB_COMPRESS_NAME and ZDB_CHECKSUM_NAME yield the ci_name of the matching
  * table entry when idx is below the table's *_FUNCTIONS bound, and the
  * literal "UNKNOWN" otherwise.
  */
 #define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
 	zio_compress_table[(idx)].ci_name : "UNKNOWN")
 #define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
 	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
 /*
  * ZDB_OT_NAME: for idx below DMU_OT_NUMTYPES use the dmu_ot[] name; for
  * any other idx accepted by DMU_OT_IS_VALID() use the name of its byteswap
  * entry; otherwise "UNKNOWN".
  */
 #define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
 	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
 	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
 /*
  * ZDB_OT_TYPE: idx itself when below DMU_OT_NUMTYPES; DMU_OT_ZAP_OTHER for
  * the two DMU_OTN_ZAP_* values; DMU_OT_NUMTYPES for everything else.
  */
 #define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
 	(((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ?	\
 	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
 
 /*
  * Variables this file shares with other code.  In a normal build they are
  * only declared here (extern); when building under lint they are defined
  * here instead.
  */
 #ifndef lint
 extern int reference_tracking_enable;
 extern boolean_t zfs_recover;
 extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
 extern int zfs_vdev_async_read_max_active;
 #else
 int reference_tracking_enable;
 boolean_t zfs_recover;
 uint64_t zfs_arc_max, zfs_arc_meta_limit;
 int zfs_vdev_async_read_max_active;
 #endif
 
 /* Program name used as the prefix of usage and error messages. */
 const char cmdname[] = "zdb";
 /*
  * Per-option state, indexed by the option character (e.g. dump_opt['G'],
  * dump_opt['d']).  Values are tested both as booleans and against numeric
  * thresholds (e.g. dump_opt['d'] < 5).
  * NOTE(review): presumably a repeat count per option -- confirm in main().
  */
 uint8_t dump_opt[256];
 
 /* Signature shared by the per-object-type dump_*() routines below. */
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 extern void dump_intent_log(zilog_t *);
 /*
  * Numeric operands: zopt_objects entries stored in zopt_object[].
  * dump_metaslabs() reads entry 0 as a vdev id and the rest as metaslab
  * numbers.
  */
 static uint64_t *zopt_object = NULL;
 static int zopt_objects = 0;
 static libzfs_handle_t *g_zfs;
 /* NOTE(review): presumably the default for the -I option -- confirm. */
 static uint64_t max_inflight = 1000;
 
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 const char *
 _umem_debug_init()
 {
 	/* Same effect as setting $UMEM_DEBUG to this value. */
 	const char *umem_debug_setting = "default,verbose";
 
 	return (umem_debug_setting);
 }
 
 const char *
 _umem_logging_init(void)
 {
 	/* Same effect as setting $UMEM_LOGGING to this value. */
 	const char *umem_logging_setting = "fail,contents";
 
 	return (umem_logging_setting);
 }
 
 /*
  * Print the command synopsis and the per-option help text to stderr, then
  * terminate the process with exit status 1.  This function never returns.
  *
  * The default quoted for -I must match the initial value of max_inflight.
  */
 static void
 usage(void)
 {
 	(void) fprintf(stderr,
 	    "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
 	    "[-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[<poolname> [<object> ...]]\n"
 	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
 	    "[<object> ...]\n"
 	    "\t%s -C [-A] [-U <cache>]\n"
 	    "\t%s -l [-Aqu] <device>\n"
 	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
 	    "\t%s -O <dataset> <path>\n"
 	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
 	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
 	    "\t%s -E [-A] word0:word1:...:word15\n"
 	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
 	    "<poolname>\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
 	    cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
 	(void) fprintf(stderr, "    If dataset name is specified, only that "
 	    "dataset is dumped\n");
 	(void) fprintf(stderr, "    If object numbers are specified, only "
 	    "those objects are dumped\n\n");
 	(void) fprintf(stderr, "    Options to control amount of output:\n");
 	(void) fprintf(stderr, "        -b block statistics\n");
 	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
 	    "all data) blocks\n");
 	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
 	(void) fprintf(stderr, "        -d dataset(s)\n");
 	(void) fprintf(stderr, "        -D dedup statistics\n");
 	(void) fprintf(stderr, "        -E decode and display block from an "
 	    "embedded block pointer\n");
 	(void) fprintf(stderr, "        -h pool history\n");
 	(void) fprintf(stderr, "        -i intent logs\n");
 	(void) fprintf(stderr, "        -l read label contents\n");
 	(void) fprintf(stderr, "        -L disable leak tracking (do not "
 	    "load spacemaps)\n");
 	(void) fprintf(stderr, "        -m metaslabs\n");
 	(void) fprintf(stderr, "        -M metaslab groups\n");
 	(void) fprintf(stderr, "        -O perform object lookups by path\n");
 	(void) fprintf(stderr, "        -R read and display block from a "
 	    "device\n");
 	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v verbose (applies to all "
 	    "others)\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
 	    "with other options:\n");
 	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
 	    "panic recovery (-AA) or both (-AAA)\n");
 	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
 	    "has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -F attempt automatic rewind within "
 	    "safe range of transaction groups\n");
 	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
 	    "exiting\n");
 	/* max_inflight is initialised to 1000, so that is the default. */
 	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
 	    "specify the maximum number of "
 	    "checksumming I/Os [default is 1000]\n");
 	(void) fprintf(stderr, "        -o <variable>=<value> set global "
 	    "variable to an unsigned 32-bit integer value\n");
 	(void) fprintf(stderr, "        -p <path> -- use one or more with "
 	    "-e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -q don't print label contents\n");
 	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
 	    "searching for uberblocks\n");
 	(void) fprintf(stderr, "        -u uberblock\n");
 	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
 	    "cachefile\n");
 	(void) fprintf(stderr, "        -V do verbatim import\n");
 	(void) fprintf(stderr, "        -x <dumpdir> -- "
 	    "dump all read blocks into specified directory\n");
 	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
 	    "work with dataset)\n\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }
 
 /*
  * When the -G option is set, print a blank line followed by the
  * zfs_dbgmsg buffer (tagged "zdb"); otherwise do nothing.
  */
 static void
 dump_debug_buffer()
 {
 	if (!dump_opt['G'])
 		return;
 
 	(void) printf("\n");
 	zfs_dbgmsg_print("zdb");
 }
 
 /*
  * Called for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
  */
 
 static void
 fatal(const char *fmt, ...)
 {
 	va_list args;
 
 	/* "<cmdname>: <formatted message>\n" on stderr. */
 	(void) fprintf(stderr, "%s: ", cmdname);
 
 	va_start(args, fmt);
 	(void) vfprintf(stderr, fmt, args);
 	va_end(args);
 
 	(void) fprintf(stderr, "\n");
 
 	/* Give -G a chance to show the debug log before we go away. */
 	dump_debug_buffer();
 
 	exit(1);
 }
 
 /*
  * Read a packed nvlist from the start of `object', unpack it and print it.
  * `data' is read as a uint64_t holding the packed size in bytes; `size' is
  * unused.  Read or unpack failures trip VERIFY().
  */
 /* ARGSUSED */
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	size_t packed_len = *(uint64_t *)data;
 	char *packed_buf = umem_alloc(packed_len, UMEM_NOFAIL);
 	nvlist_t *unpacked;
 
 	VERIFY(0 == dmu_read(os, object, 0, packed_len, packed_buf,
 	    DMU_READ_PREFETCH));
 	VERIFY(nvlist_unpack(packed_buf, packed_len, &unpacked, 0) == 0);
 
 	/* The packed form is no longer needed once it has been unpacked. */
 	umem_free(packed_buf, packed_len);
 
 	dump_nvlist(unpacked, 8);
 	nvlist_free(unpacked);
 }
 
 /*
  * Print the five fields of the spa_history_phys_t that `data' points to.
  * Does nothing when `data' is NULL.  `os', `object' and `size' are unused.
  */
 /* ARGSUSED */
 static void
 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	const spa_history_phys_t *hist = data;
 
 	if (hist == NULL)
 		return;
 
 	(void) printf("\t\tpool_create_len = %llu\n",
 	    (u_longlong_t)hist->sh_pool_create_len);
 	(void) printf("\t\tphys_max_off = %llu\n",
 	    (u_longlong_t)hist->sh_phys_max_off);
 	(void) printf("\t\tbof = %llu\n", (u_longlong_t)hist->sh_bof);
 	(void) printf("\t\teof = %llu\n", (u_longlong_t)hist->sh_eof);
 	(void) printf("\t\trecords_lost = %llu\n",
 	    (u_longlong_t)hist->sh_records_lost);
 }
 
 static void
 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
 	else
 		nicenum(num, buf, sizeof (buf));
 }
 
 /*
  * Bar material for dump_histogram(): a fixed run of '*' characters, and
  * its length not counting the terminating NUL.
  */
 const char histo_stars[] = "****************************************";
 const int histo_width = sizeof (histo_stars) - 1;
 
 /*
  * Print histo[0..size-1] as rows of "<bucket>: <count> <stars>", limited
  * to the span between the first and last non-zero buckets.  `offset' is
  * added to the bucket number that is printed.  Bars are scaled against the
  * largest count, or against histo_width if the largest count is smaller.
  */
 static void
 dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int first = size - 1;
 	int last = 0;
 	uint64_t peak = 0;
 
 	for (int b = 0; b < size; b++) {
 		uint64_t count = histo[b];
 
 		if (count == 0)
 			continue;
 		if (count > peak)
 			peak = count;
 		if (b > last)
 			last = b;
 		if (b < first)
 			first = b;
 	}
 
 	if (peak < histo_width)
 		peak = histo_width;
 
 	for (int b = first; b <= last; b++) {
 		/* A smaller count starts further into the star string. */
 		(void) printf("\t\t\t%3u: %6llu %s\n",
 		    b + offset, (u_longlong_t)histo[b],
 		    &histo_stars[(peak - histo[b]) * histo_width / peak]);
 	}
 }
 
 /*
  * Print the statistics zap_get_stats() reports for a ZAP object.  An
  * object whose pointer table length is zero gets a one-line "microzap"
  * summary; anything else gets the fat-ZAP pointer-table fields, totals and
  * five histograms.  Nothing is printed if the statistics cannot be fetched.
  */
 static void
 dump_zap_stats(objset_t *os, uint64_t object)
 {
 	zap_stats_t stats;
 
 	if (zap_get_stats(os, object, &stats) != 0)
 		return;
 
 	if (stats.zs_ptrtbl_len == 0) {
 		ASSERT(stats.zs_num_blocks == 1);
 		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
 		    (u_longlong_t)stats.zs_blocksize,
 		    (u_longlong_t)stats.zs_num_entries);
 		return;
 	}
 
 	(void) printf("\tFat ZAP stats:\n");
 
 	/* Pointer table. */
 	(void) printf("\t\tPointer table:\n");
 	(void) printf("\t\t\t%llu elements\n",
 	    (u_longlong_t)stats.zs_ptrtbl_len);
 	(void) printf("\t\t\tzt_blk: %llu\n",
 	    (u_longlong_t)stats.zs_ptrtbl_zt_blk);
 	(void) printf("\t\t\tzt_numblks: %llu\n",
 	    (u_longlong_t)stats.zs_ptrtbl_zt_numblks);
 	(void) printf("\t\t\tzt_shift: %llu\n",
 	    (u_longlong_t)stats.zs_ptrtbl_zt_shift);
 	(void) printf("\t\t\tzt_blks_copied: %llu\n",
 	    (u_longlong_t)stats.zs_ptrtbl_blks_copied);
 	(void) printf("\t\t\tzt_nextblk: %llu\n",
 	    (u_longlong_t)stats.zs_ptrtbl_nextblk);
 
 	/* Totals and header fields. */
 	(void) printf("\t\tZAP entries: %llu\n",
 	    (u_longlong_t)stats.zs_num_entries);
 	(void) printf("\t\tLeaf blocks: %llu\n",
 	    (u_longlong_t)stats.zs_num_leafs);
 	(void) printf("\t\tTotal blocks: %llu\n",
 	    (u_longlong_t)stats.zs_num_blocks);
 	(void) printf("\t\tzap_block_type: 0x%llx\n",
 	    (u_longlong_t)stats.zs_block_type);
 	(void) printf("\t\tzap_magic: 0x%llx\n",
 	    (u_longlong_t)stats.zs_magic);
 	(void) printf("\t\tzap_salt: 0x%llx\n",
 	    (u_longlong_t)stats.zs_salt);
 
 	/* Histograms. */
 	(void) printf("\t\tLeafs with 2^n pointers:\n");
 	dump_histogram(stats.zs_leafs_with_2n_pointers,
 	    ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBlocks with n*5 entries:\n");
 	dump_histogram(stats.zs_blocks_with_n5_entries,
 	    ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBlocks n/10 full:\n");
 	dump_histogram(stats.zs_blocks_n_tenths_full,
 	    ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tEntries with n chunks:\n");
 	dump_histogram(stats.zs_entries_using_n_chunks,
 	    ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBuckets with n entries:\n");
 	dump_histogram(stats.zs_buckets_with_n_entries,
 	    ZAP_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Object viewer that prints nothing; all arguments are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer that only prints a fixed "unknown type" marker; all
  * arguments are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) fputs("\tUNKNOWN OBJECT TYPE\n", stdout);
 }
 
 /*
  * Object viewer with an empty body: nothing is printed and all arguments
  * are ignored.  Unlike its neighbours this one is not static.
  */
 /*ARGSUSED*/
 void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer with an empty body: nothing is printed and all arguments
  * are ignored.
  */
 /*ARGSUSED*/
 static void
 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Print a ZAP object's statistics followed by one "name = value" line per
  * entry.  One-byte-wide values are printed as a string; 2-, 4- and 8-byte
  * values are printed as a space-separated list of integers.  `data' and
  * `size' are unused.
  */
 /*ARGSUSED*/
 static void
 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		(void) printf("\t\t%s = ", entry.za_name);
 
 		if (entry.za_num_integers != 0) {
 			/* Zero-filled, so a failed lookup prints zeros. */
 			void *value = umem_zalloc(entry.za_num_integers *
 			    entry.za_integer_length, UMEM_NOFAIL);
 
 			(void) zap_lookup(os, object, entry.za_name,
 			    entry.za_integer_length, entry.za_num_integers,
 			    value);
 
 			if (entry.za_integer_length != 1) {
 				for (int n = 0; n < entry.za_num_integers;
 				    n++) {
 					switch (entry.za_integer_length) {
 					case 2:
 						(void) printf("%u ",
 						    ((uint16_t *)value)[n]);
 						break;
 					case 4:
 						(void) printf("%u ",
 						    ((uint32_t *)value)[n]);
 						break;
 					case 8:
 						(void) printf("%lld ",
 						    (u_longlong_t)
 						    ((int64_t *)value)[n]);
 						break;
 					}
 				}
 			} else {
 				(void) printf("%s", (char *)value);
 			}
 
 			(void) printf("\n");
 			umem_free(value, entry.za_num_integers *
 			    entry.za_integer_length);
 		} else {
 			/* Entry has no value: just finish the line. */
 			(void) printf("\n");
 		}
 
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Print the header of a bpobj from the bpobj_phys_t in `data' (nothing is
  * printed if it is NULL).  `size' decides which optional fields are shown.
  * With -ddddd or more, each block pointer stored in the object is also
  * read and printed in compact form.
  */
 static void
 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	bpobj_phys_t *phys = data;
 	char bytes_str[32], comp_str[32], uncomp_str[32];
 
 	/* make sure the output won't get truncated */
 	CTASSERT(sizeof (bytes_str) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (comp_str) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (uncomp_str) >= NN_NUMBUF_SZ);
 
 	if (phys == NULL)
 		return;
 
 	zdb_nicenum(phys->bpo_bytes, bytes_str, sizeof (bytes_str));
 	zdb_nicenum(phys->bpo_comp, comp_str, sizeof (comp_str));
 	zdb_nicenum(phys->bpo_uncomp, uncomp_str, sizeof (uncomp_str));
 
 	(void) printf("\t\tnum_blkptrs = %llu\n",
 	    (u_longlong_t)phys->bpo_num_blkptrs);
 	(void) printf("\t\tbytes = %s\n", bytes_str);
 	if (size >= BPOBJ_SIZE_V1) {
 		(void) printf("\t\tcomp = %s\n", comp_str);
 		(void) printf("\t\tuncomp = %s\n", uncomp_str);
 	}
 	if (size >= sizeof (*phys)) {
 		(void) printf("\t\tsubobjs = %llu\n",
 		    (u_longlong_t)phys->bpo_subobjs);
 		(void) printf("\t\tnum_subobjs = %llu\n",
 		    (u_longlong_t)phys->bpo_num_subobjs);
 	}
 
 	if (dump_opt['d'] >= 5) {
 		for (uint64_t idx = 0; idx < phys->bpo_num_blkptrs; idx++) {
 			char line[BP_SPRINTF_LEN];
 			blkptr_t bp;
 			int err;
 
 			err = dmu_read(os, object, idx * sizeof (bp),
 			    sizeof (bp), &bp, 0);
 			if (err != 0) {
 				(void) printf("got error %u from dmu_read\n",
 				    err);
 				break;
 			}
 			snprintf_blkptr_compact(line, sizeof (line), &bp);
 			(void) printf("\t%s\n", line);
 		}
 	}
 }
 
 /*
  * Read the whole object as an array of 64-bit values and print every
  * entry up to and including the last non-zero one.  `data' and `size' are
  * unused.
  */
 /* ARGSUSED */
 static void
 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dmu_object_info_t info;
 
 	VERIFY0(dmu_object_info(os, object, &info));
 	uint64_t *subobjs = kmem_alloc(info.doi_max_offset, KM_SLEEP);
 
 	int err = dmu_read(os, object, 0, info.doi_max_offset, subobjs, 0);
 	if (err != 0) {
 		(void) printf("got error %u from dmu_read\n", err);
 		kmem_free(subobjs, info.doi_max_offset);
 		return;
 	}
 
 	/* Trim trailing zero entries by walking back from the end. */
 	int64_t used = info.doi_max_offset / 8;
 	while (used > 0 && subobjs[used - 1] == 0)
 		used--;
 
 	for (int64_t n = 0; n < used; n++)
 		(void) printf("\t%llu\n", (longlong_t)subobjs[n]);
 
 	kmem_free(subobjs, info.doi_max_offset);
 }
 
 /*
  * Object viewer for DDT ZAP objects: prints only the ZAP statistics.
  * `data' and `size' are unused.
  */
 /*ARGSUSED*/
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dump_zap_stats(os, object);
 	/* contents are printed elsewhere, properly decoded */
 }
 
 /*
  * Print the ZAP statistics and then each entry's name with its first
  * integer, both raw (hex) and split by the ATTR_LENGTH, ATTR_BSWAP and
  * ATTR_NUM macros.  `data' and `size' are unused.
  */
 /*ARGSUSED*/
 static void
 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		(void) printf("\t\t%s = ", entry.za_name);
 		if (entry.za_num_integers != 0) {
 			(void) printf(" %llx : [%d:%d:%d]\n",
 			    (u_longlong_t)entry.za_first_integer,
 			    (int)ATTR_LENGTH(entry.za_first_integer),
 			    (int)ATTR_BSWAP(entry.za_first_integer),
 			    (int)ATTR_NUM(entry.za_first_integer));
 		} else {
 			(void) printf("\n");
 		}
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Print the ZAP statistics and then each entry as "name = [ a  b ... ]",
  * where the values are the entry's 16-bit integers.  An entry with no
  * integers ends its line right after the opening bracket.  `data' and
  * `size' are unused.
  */
 /*ARGSUSED*/
 static void
 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		(void) printf("\t\t%s = [", entry.za_name);
 
 		if (entry.za_num_integers != 0) {
 			uint16_t *ids;
 
 			VERIFY(entry.za_integer_length == 2);
 			ids = umem_zalloc(entry.za_num_integers *
 			    entry.za_integer_length, UMEM_NOFAIL);
 
 			VERIFY(zap_lookup(os, object, entry.za_name,
 			    entry.za_integer_length,
 			    entry.za_num_integers, ids) == 0);
 
 			for (int n = 0; n != entry.za_num_integers; n++)
 				(void) printf(" %d ", (int)ids[n]);
 			(void) printf("]\n");
 
 			umem_free(ids, entry.za_num_integers *
 			    entry.za_integer_length);
 		} else {
 			(void) printf("\n");
 		}
 
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Print the ZAP statistics and then one line per directory entry showing
  * its name, the object number from ZFS_DIRENT_OBJ() and a type label
  * looked up via ZFS_DIRENT_TYPE().  `data' and `size' are unused.
  */
 /*ARGSUSED*/
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	/* Labels indexed by the entry's type code. */
 	const char *type_labels[] = {
 		/* 0 */ "not specified",
 		/* 1 */ "FIFO",
 		/* 2 */ "Character Device",
 		/* 3 */ "3 (invalid)",
 		/* 4 */ "Directory",
 		/* 5 */ "5 (invalid)",
 		/* 6 */ "Block Device",
 		/* 7 */ "7 (invalid)",
 		/* 8 */ "Regular File",
 		/* 9 */ "9 (invalid)",
 		/* 10 */ "Symbolic Link",
 		/* 11 */ "11 (invalid)",
 		/* 12 */ "Socket",
 		/* 13 */ "Door",
 		/* 14 */ "Event Port",
 		/* 15 */ "15 (invalid)",
 	};
 	zap_cursor_t cursor;
 	zap_attribute_t entry;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&cursor, os, object);
 	while (zap_cursor_retrieve(&cursor, &entry) == 0) {
 		(void) printf("\t\t%s = %lld (type: %s)\n",
 		    entry.za_name, ZFS_DIRENT_OBJ(entry.za_first_integer),
 		    type_labels[ZFS_DIRENT_TYPE(entry.za_first_integer)]);
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 }
 
 /*
  * Count, over the vdev tree rooted at `vd', the leaf vdevs whose DTL space
  * map exists and whose dbuf is exactly sizeof (space_map_phys_t) bytes.
  */
 int
 get_dtl_refcount(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		int total = 0;
 
 		for (int c = 0; c < vd->vdev_children; c++)
 			total += get_dtl_refcount(vd->vdev_child[c]);
 		return (total);
 	}
 
 	/* Leaf: contributes one or zero. */
 	space_map_t *dtl_sm = vd->vdev_dtl_sm;
 
 	if (dtl_sm == NULL)
 		return (0);
 	if (dtl_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 		return (0);
 	return (1);
 }
 
 /*
  * Count, over the vdev tree rooted at `vd', the metaslabs whose space map
  * exists and whose dbuf is exactly sizeof (space_map_phys_t) bytes.  Only
  * top-level vdevs that are not being removed have their metaslabs
  * examined; children are always visited.
  */
 int
 get_metaslab_refcount(vdev_t *vd)
 {
 	int total = 0;
 	boolean_t count_here = (vd->vdev_top == vd && !vd->vdev_removing);
 
 	for (int m = 0; count_here && m < vd->vdev_ms_count; m++) {
 		space_map_t *ms_sm = vd->vdev_ms[m]->ms_sm;
 
 		if (ms_sm == NULL)
 			continue;
 		if (ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
 			total++;
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		total += get_metaslab_refcount(vd->vdev_child[c]);
 
 	return (total);
 }
 
 /*
  * Compare the refcount recorded for SPA_FEATURE_SPACEMAP_HISTOGRAM with
  * the number of qualifying DTL and metaslab space maps found by walking
  * the vdev tree.  Returns 0 when they agree; otherwise prints both values
  * and returns 2.
  */
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t expected = 0;
 	uint64_t actual;
 
 	/* If the lookup fails, `expected' keeps its initial value of 0. */
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected);
 
 	actual = get_dtl_refcount(root);
 	actual += get_metaslab_refcount(root);
 
 	if (expected == actual)
 		return (0);
 
 	(void) printf("space map refcount mismatch: expected %lld != "
 	    "actual %lld\n",
 	    (longlong_t)expected,
 	    (longlong_t)actual);
 	return (2);
 }
 
 /*
  * Walk every 64-bit entry of space map `sm' (no-op when `sm' is NULL) and
  * print it.  Debug entries show action, txg and sync pass; all other
  * entries show an A/F marker with the decoded range and size.  The net of
  * allocations minus frees is accumulated and a warning is printed if it
  * differs from space_map_allocated().
  */
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	char *action_names[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 	uint64_t word;
 	uint64_t net_alloc = 0;
 
 	if (sm == NULL)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	for (uint64_t off = 0; off < space_map_length(sm);
 	    off += sizeof (word)) {
 		uint8_t shift = sm->sm_shift;
 
 		VERIFY0(dmu_read(os, space_map_object(sm), off,
 		    sizeof (word), &word, DMU_READ_PREFETCH));
 
 		if (SM_DEBUG_DECODE(word)) {
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(off / sizeof (word)),
 			    action_names[SM_DEBUG_ACTION_DECODE(word)],
 			    (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
 			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
 			continue;
 		}
 
 		(void) printf("\t    [%6llu]    %c  range:"
 		    " %010llx-%010llx  size: %06llx\n",
 		    (u_longlong_t)(off / sizeof (word)),
 		    SM_TYPE_DECODE(word) == SM_ALLOC ? 'A' : 'F',
 		    (u_longlong_t)((SM_OFFSET_DECODE(word) <<
 		    shift) + sm->sm_start),
 		    (u_longlong_t)((SM_OFFSET_DECODE(word) <<
 		    shift) + sm->sm_start +
 		    (SM_RUN_DECODE(word) << shift)),
 		    (u_longlong_t)(SM_RUN_DECODE(word) << shift));
 
 		if (SM_TYPE_DECODE(word) != SM_ALLOC)
 			net_alloc -= SM_RUN_DECODE(word) << shift;
 		else
 			net_alloc += SM_RUN_DECODE(word) << shift;
 	}
 
 	if (net_alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
 		    "with space map summary (%llu)\n",
 		    (u_longlong_t)space_map_allocated(sm),
 		    (u_longlong_t)net_alloc);
 	}
 }
 
 /*
  * Print a one-line summary for a metaslab (segment count, maximum block
  * size, percentage of ms_size held in ms_tree) followed by the histogram
  * of its in-memory range tree.
  */
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	char maxsize_str[32];
 	range_tree_t *tree = msp->ms_tree;
 	avl_tree_t *by_size = &msp->ms_size_tree;
 	int pct_free = range_tree_space(tree) * 100 / msp->ms_size;
 
 	/* make sure nicenum has enough space */
 	CTASSERT(sizeof (maxsize_str) >= NN_NUMBUF_SZ);
 
 	zdb_nicenum(metaslab_block_maxsize(msp), maxsize_str,
 	    sizeof (maxsize_str));
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", avl_numnodes(by_size), "maxsize", maxsize_str,
 	    "freepct", pct_free);
 	(void) printf("\tIn-memory histogram:\n");
 	dump_histogram(tree->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print one metaslab: always a summary line (id, start offset, space map
  * object, free space), then progressively more detail depending on how
  * many times -m (or -d) was given:
  *   -mmm   (and no -L): in-memory statistics and histogram
  *   -mm    : on-disk space map histogram, if that feature is active
  *   -mmmm or -dddddd: every space map entry
  */
 static void
 dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 	char freebuf[32];
 
 	/* Free space = metaslab size minus what the space map has allocated. */
 	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
 	    sizeof (freebuf));
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		/*
 		 * Load the metaslab (if it is not loaded already) under
 		 * ms_lock so its range tree can be reported, then unload it
 		 * again before dropping the lock.
 		 */
 		mutex_enter(&msp->ms_lock);
 		metaslab_load_wait(msp);
 		if (!msp->ms_loaded) {
 			VERIFY0(metaslab_load(msp));
 			range_tree_stat_verify(msp->ms_tree);
 		}
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (dump_opt['m'] > 1 && sm != NULL &&
 	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		/*
 		 * The space map histogram represents free space in chunks
 		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 		 */
 		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 		    (u_longlong_t)msp->ms_fragmentation);
 		dump_histogram(sm->sm_phys->smp_histogram,
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
 	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
 		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 
 		/* dump_spacemap() is a no-op if this metaslab has no map. */
 		mutex_enter(&msp->ms_lock);
 		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 		mutex_exit(&msp->ms_lock);
 	}
 }
 
 /*
  * Print the two-line table heading (vdev id, metaslab count and column
  * titles) plus the dashed rule that precede a vdev's metaslab rows.
  */
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {
 	u_longlong_t id = (u_longlong_t)vd->vdev_id;
 	u_longlong_t ms_count = (u_longlong_t)vd->vdev_ms_count;
 
 	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
 	    id, "metaslabs", ms_count, "offset", "spacemap", "free");
 
 	(void) printf("\t%15s   %19s   %15s   %10s\n",
 	    "---------------", "-------------------",
 	    "---------------", "-------------");
 }
 
 /*
  * For every top-level vdev whose metaslab group belongs to the pool's
  * normal class, recompute and print the group's fragmentation and its
  * histogram; then print the same for the class as a whole.  A
  * fragmentation of ZFS_FRAG_INVALID is shown as "-".
  */
 static void
 dump_metaslab_groups(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	uint64_t pool_frag;
 
 	metaslab_class_histogram_verify(normal);
 
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 		metaslab_group_t *group = top->vdev_mg;
 
 		if (group->mg_class == normal) {
 			metaslab_group_histogram_verify(group);
 			group->mg_fragmentation =
 			    metaslab_group_fragmentation(group);
 
 			(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
 			    "fragmentation",
 			    (u_longlong_t)top->vdev_id,
 			    (u_longlong_t)top->vdev_ms_count);
 			if (group->mg_fragmentation != ZFS_FRAG_INVALID) {
 				(void) printf("%3llu%%\n",
 				    (u_longlong_t)group->mg_fragmentation);
 			} else {
 				(void) printf("%3s\n", "-");
 			}
 			dump_histogram(group->mg_histogram,
 			    RANGE_TREE_HISTOGRAM_SIZE, 0);
 		}
 	}
 
 	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
 	pool_frag = metaslab_class_fragmentation(normal);
 	if (pool_frag != ZFS_FRAG_INVALID)
 		(void) printf("\t%3llu%%\n", (u_longlong_t)pool_frag);
 	else
 		(void) printf("\t%3s\n", "-");
 	dump_histogram(normal->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print metaslabs for the pool.  Without -d, numeric operands narrow the
  * output: the first selects a top-level vdev (a bad id is fatal) and any
  * further ones select individual metaslabs of that vdev.  Otherwise every
  * metaslab of every top-level vdev is printed.
  */
 static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	vdev_t *vd;
 	uint64_t first = 0;
 	uint64_t limit = root->vdev_children;
 
 	(void) printf("\nMetaslabs:\n");
 
 	if (!dump_opt['d'] && zopt_objects > 0) {
 		first = zopt_object[0];
 
 		if (first >= limit)
 			(void) fatal("bad vdev id: %llu", (u_longlong_t)first);
 
 		if (zopt_objects > 1) {
 			/* Explicit metaslab list for this one vdev. */
 			vd = root->vdev_child[first];
 			print_vdev_metaslab_header(vd);
 
 			for (uint64_t n = 1; n < zopt_objects; n++) {
 				if (zopt_object[n] >= vd->vdev_ms_count) {
 					(void) fprintf(stderr, "bad metaslab "
 					    "number %llu\n",
 					    (u_longlong_t)zopt_object[n]);
 				} else {
 					dump_metaslab(
 					    vd->vdev_ms[zopt_object[n]]);
 				}
 			}
 			(void) printf("\n");
 			return;
 		}
 
 		/* Only the selected vdev, but all of its metaslabs. */
 		limit = first + 1;
 	}
 
 	for (uint64_t v = first; v < limit; v++) {
 		vd = root->vdev_child[v];
 		print_vdev_metaslab_header(vd);
 
 		for (uint64_t n = 0; n < vd->vdev_ms_count; n++)
 			dump_metaslab(vd->vdev_ms[n]);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print one line for each physical variant of a dedup-table entry whose
  * birth txg is non-zero: the walk index, reference count, variant label
  * and the block pointer rebuilt from the entry's key.
  */
 static void
 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
 {
 	char *labels[4] = { "ditto", "single", "double", "triple" };
 	const ddt_key_t *key = &dde->dde_key;
 	char bpstr[BP_SPRINTF_LEN];
 	blkptr_t bp;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++) {
 		const ddt_phys_t *phys = &dde->dde_phys[p];
 
 		if (phys->ddp_phys_birth != 0) {
 			ddt_bp_create(ddt->ddt_checksum, key, phys, &bp);
 			snprintf_blkptr(bpstr, sizeof (bpstr), &bp);
 			(void) printf("index %llx refcnt %llu %s %s\n",
 			    (u_longlong_t)index,
 			    (u_longlong_t)phys->ddp_refcnt,
 			    labels[p], bpstr);
 		}
 	}
 }
 
 /*
  * Print the dedup, compress and copies ratios derived from `dds', plus
  * their combination dedup * compress / copies.  Prints nothing when the
  * statistics cover zero blocks.
  */
 static void
 dump_dedup_ratio(const ddt_stat_t *dds)
 {
 	if (dds->dds_blocks == 0)
 		return;
 
 	double ref_lsize = (double)dds->dds_ref_lsize;
 	double ref_psize = (double)dds->dds_ref_psize;
 	double ref_dsize = (double)dds->dds_ref_dsize;
 	double dsize = (double)dds->dds_dsize;
 
 	double dedup = ref_dsize / dsize;
 	double compress = ref_lsize / ref_psize;
 	double copies = ref_dsize / ref_psize;
 
 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
 	    "dedup * compress / copies = %.2f\n\n",
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
 /*
  * Report on one (type, class) object of a dedup table.  Objects that do
  * not exist or hold no entries are skipped silently.  Output grows with
  * the number of -D flags: a summary line always, the histogram at 3, and
  * the individual entries at 4 (at 5 for the DDT_CLASS_UNIQUE class).
  */
 static void
 dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 {
 	char objname[DDT_NAMELEN];
 	ddt_entry_t entry;
 	dmu_object_info_t info;
 	uint64_t cursor = 0;
 	uint64_t nentries, on_disk, in_core;
 	int err;
 
 	err = ddt_object_info(ddt, type, class, &info);
 	if (err == ENOENT)
 		return;
 	ASSERT(err == 0);
 
 	err = ddt_object_count(ddt, type, class, &nentries);
 	ASSERT(err == 0);
 	if (nentries == 0)
 		return;
 
 	on_disk = info.doi_physical_blocks_512 << 9;
 	in_core = info.doi_fill_count * info.doi_data_block_size;
 
 	ddt_object_name(ddt, type, class, objname);
 
 	/* Sizes are reported per entry. */
 	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
 	    objname,
 	    (u_longlong_t)nentries,
 	    (u_longlong_t)(on_disk / nentries),
 	    (u_longlong_t)(in_core / nentries));
 
 	if (dump_opt['D'] < 3)
 		return;
 
 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
 
 	if (dump_opt['D'] < 4 ||
 	    (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE))
 		return;
 
 	(void) printf("%s contents:\n\n", objname);
 
 	for (;;) {
 		err = ddt_object_walk(ddt, type, class, &cursor, &entry);
 		if (err != 0)
 			break;
 		dump_dde(ddt, &entry, cursor);
 	}
 
 	ASSERT(err == ENOENT);
 
 	(void) printf("\n");
 }
 
 /*
  * Dump every dedup table of the pool (each checksum, type and class),
  * then the pool-wide totals: with -DD or more the aggregated histogram,
  * and always the dedup ratio line.  If the pool-wide statistics show zero
  * blocks, only "All DDTs are empty" is printed after the per-table output.
  */
 static void
 dump_all_ddts(spa_t *spa)
 {
 	ddt_histogram_t total_histo = { 0 };
 	ddt_stat_t total_stats = { 0 };
 
 	for (enum zio_checksum ck = 0; ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
 		ddt_t *table = spa->spa_ddt[ck];
 
 		for (enum ddt_type ty = 0; ty < DDT_TYPES; ty++) {
 			for (enum ddt_class cl = 0; cl < DDT_CLASSES; cl++)
 				dump_ddt(table, ty, cl);
 		}
 	}
 
 	ddt_get_dedup_stats(spa, &total_stats);
 
 	if (total_stats.dds_blocks == 0) {
 		(void) printf("All DDTs are empty\n");
 		return;
 	}
 
 	(void) printf("\n");
 
 	if (dump_opt['D'] > 1) {
 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
 		ddt_get_dedup_histogram(spa, &total_histo);
 		zpool_dump_ddt(&total_stats, &total_histo);
 	}
 
 	dump_dedup_ratio(&total_stats);
 }
 
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
 	char *prefix = arg;
 
 	(void) printf("%s [%llu,%llu) length %llu\n",
 	    prefix,
 	    (u_longlong_t)start,
 	    (u_longlong_t)(start + size),
 	    (u_longlong_t)(size));
 }
 
 /*
  * Recursively print the dirty time logs of `vd' and its children, with
  * `indent' columns of leading space (the top-level call passes 0 and also
  * gets the section heading).  Each vdev line shows whether its DTL is
  * required; each non-empty DTL type then lists its segments.  With
  * -dddddd, the DTL space map of a childless vdev is dumped as well.
  */
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
 	char *type_names[DTL_TYPES] =
 	    { "missing", "partial", "scrub", "outage" };
 	spa_t *spa = vd->vdev_spa;
 	char seg_prefix[256];
 	boolean_t required;
 	const char *label;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
 	/* Prefer the device path, then the vdev type, then the pool name. */
 	if (vd->vdev_path)
 		label = vd->vdev_path;
 	else if (vd->vdev_parent)
 		label = vd->vdev_ops->vdev_op_type;
 	else
 		label = spa_name(spa);
 
 	(void) printf("\t%*s%s [%s]\n", indent, "", label,
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_t *tree = vd->vdev_dtl[t];
 
 		if (range_tree_space(tree) != 0) {
 			(void) snprintf(seg_prefix, sizeof (seg_prefix),
 			    "\t%*s%s", indent + 2, "", type_names[t]);
 			mutex_enter(tree->rt_lock);
 			range_tree_walk(tree, dump_dtl_seg, seg_prefix);
 			mutex_exit(tree->rt_lock);
 			if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
 				dump_spacemap(spa->spa_meta_objset,
 				    vd->vdev_dtl_sm);
 			}
 		}
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
 /*
  * from spa_history.c: spa_history_create_obj()
  *
  * dump_history() starts with a buffer of HIS_BUF_LEN_DEF bytes (128 KiB)
  * and gives up once doubling it would reach HIS_BUF_LEN_MAX (1 GiB).
  */
 #define	HIS_BUF_LEN_DEF	(128 << 10)
 #define	HIS_BUF_LEN_MAX	(1 << 30)
 
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
 	char *buf = NULL;
 	uint64_t bufsize = HIS_BUF_LEN_DEF;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
 	time_t tsec;
 	struct tm t;
 	char tbuf[30];
 	char internalstr[MAXPATHLEN];
 
 	if ((buf = malloc(bufsize)) == NULL)
 		(void) fprintf(stderr, "Unable to read history: "
 		    "out of memory\n");
 	do {
 		len = bufsize;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
 			    "error %d\n", error);
 			return;
 		}
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
 		off -= resid;
 
 		/*
 		 * If the history block is too big, double the buffer
 		 * size and try again.
 		 */
 		if (resid == len) {
 			free(buf);
 			buf = NULL;
 
 			bufsize <<= 1;
 			if ((bufsize >= HIS_BUF_LEN_MAX) ||
 			    ((buf = malloc(bufsize)) == NULL)) {
 				(void) fprintf(stderr, "Unable to read history: "
 				    "out of memory\n");
 				return;
 			}
 		}
 	} while (len != 0);
 	free(buf);
 
 	(void) printf("\nHistory:\n");
 	for (int i = 0; i < num; i++) {
 		uint64_t time, txg, ievent;
 		char *cmd, *intstr;
 		boolean_t printed = B_FALSE;
 
 		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
 		    &time) != 0)
 			goto next;
 		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
 		    &cmd) != 0) {
 			if (nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
 				goto next;
 			verify(nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG, &txg) == 0);
 			verify(nvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR, &intstr) == 0);
 			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
 				goto next;
 
 			(void) snprintf(internalstr,
 			    sizeof (internalstr),
 			    "[internal %s txg:%lld] %s",
 			    zfs_history_event_names[ievent], txg,
 			    intstr);
 			cmd = internalstr;
 		}
 		tsec = time;
 		(void) localtime_r(&tsec, &t);
 		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		(void) printf("%s %s\n", tbuf, cmd);
 		printed = B_TRUE;
 
 next:
 		if (dump_opt['h'] > 1) {
 			if (!printed)
 				(void) printf("unrecognized record:\n");
 			dump_nvlist(events[i], 2);
 		}
 	}
 }
 
 /*
  * Object viewer installed in the "DMU dnode" slot of object_viewer[].
  * It takes the common viewer arguments but deliberately prints nothing.
  */
 /*ARGSUSED*/
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Translate the block id in bookmark 'zb' into the offset that
  * print_indirect() displays.
  *
  * With a dnode ('dnp' != NULL) the block id at level zb_level is first
  * scaled to a level-0 block id using the number of block pointers per
  * indirect block, then multiplied by the data block size in bytes.
  *
  * Without a dnode the bookmark must have a negative level; the result is
  * the raw block id for object 0 and blkid * LSIZE(bp) otherwise.
  */
 static uint64_t
 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
 	if (dnp != NULL) {
 		int64_t shift;
 		uint64_t l0_blkid;
 
 		ASSERT(zb->zb_level >= 0);
 
 		/* log2 of the level-0 blocks covered by one zb_level block */
 		shift = zb->zb_level *
 		    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
 		l0_blkid = zb->zb_blkid << shift;
 
 		/* dn_datablkszsec counts (1 << SPA_MINBLOCKSHIFT)-byte units */
 		return ((l0_blkid * dnp->dn_datablkszsec) <<
 		    SPA_MINBLOCKSHIFT);
 	}
 
 	ASSERT(zb->zb_level < 0);
 
 	return (zb->zb_object == 0 ? zb->zb_blkid :
 	    zb->zb_blkid * BP_GET_LSIZE(bp));
 }
 
 /*
  * Format a one-line, compact description of block pointer 'bp' into
  * 'blkbuf', never writing more than 'buflen' bytes.
  *
  *  - With dump_opt['b'] >= 6 the full snprintf_blkptr() form is used.
  *  - Embedded block pointers print their embedded type, logical and
  *    physical sizes and birth txg.
  *  - Otherwise the DVAs are printed as "vdev:offset:asize " (only the first
  *    one unless dump_opt['d'] > 5), followed by the sizes, fill count and
  *    birth txgs; holes print only the logical size and birth txg.
  */
 static void
 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 
 	if (dump_opt['b'] >= 6) {
 		snprintf_blkptr(blkbuf, buflen, bp);
 		return;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/* Bounded by buflen like every other write in here. */
 		(void) snprintf(blkbuf, buflen,
 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
 		    (unsigned int)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
 		    (u_longlong_t)bp->blk_birth);
 		return;
 	}
 
 	/* Start from an empty string; each piece below is appended to it. */
 	blkbuf[0] = '\0';
 	for (int i = 0; i < ndvas; i++)
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL B=%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)bp->blk_birth);
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL/%llxP F=%llu B=%llu/%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    (u_longlong_t)bp->blk_birth,
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
 	}
 }
 
 /*
  * Print one line of the "Indirect blocks" listing for 'bp': the offset the
  * block maps to, a level marker placed in a column that depends on the
  * block's level, and the compact block pointer description.
  */
 static void
 print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char desc[BP_SPRINTF_LEN];
 	u_longlong_t offset;
 	int lvl;
 
 	/* Type and level are only checked for non-embedded pointers. */
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 	}
 
 	offset = blkid2offset(dnp, bp, zb);
 	(void) printf("%16llx ", offset);
 
 	ASSERT(zb->zb_level >= 0);
 
 	/*
 	 * One column per level from the top level down to -1; the column
 	 * matching this block's level gets the "L<n>" marker, all others
 	 * a blank.
 	 */
 	lvl = dnp->dn_nlevels - 1;
 	while (lvl >= -1) {
 		if (lvl != zb->zb_level)
 			(void) printf(" ");
 		else
 			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
 		lvl--;
 	}
 
 	snprintf_blkptr_compact(desc, sizeof (desc), bp);
 	(void) printf("%s\n", desc);
 }
 
 /*
  * Print 'bp' and, when it is a non-hole indirect block, read it and visit
  * every block pointer it contains, depth first.  Block pointers with a
  * birth txg of zero are skipped silently.
  *
  * Returns 0 on success or the first error from arc_read() or from a
  * recursive visit; remaining siblings are not visited after an error.
  */
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf;
 	blkptr_t *child;
 	uint64_t fill_sum = 0;
 	int nptrs;
 	int err;
 
 	if (bp->blk_birth == 0)
 		return (0);
 
 	print_indirect(bp, zb, dnp);
 
 	/* Level-0 blocks and holes have nothing below them. */
 	if (!(BP_GET_LEVEL(bp) > 0) || BP_IS_HOLE(bp))
 		return (0);
 
 	/* number of block pointers held by this indirect block */
 	nptrs = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
 	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 	if (err != 0)
 		return (err);
 	ASSERT(abuf->b_data);
 
 	/* recursively visit blocks below this */
 	child = abuf->b_data;
 	for (int i = 0; i < nptrs; i++) {
 		zbookmark_phys_t child_zb;
 
 		SET_BOOKMARK(&child_zb, zb->zb_objset, zb->zb_object,
 		    zb->zb_level - 1,
 		    zb->zb_blkid * nptrs + i);
 		err = visit_indirect(spa, dnp, &child[i], &child_zb);
 		if (err != 0)
 			break;
 		fill_sum += BP_GET_FILL(&child[i]);
 	}
 
 	/* The children's fill counts must add up to the parent's. */
 	if (err == 0)
 		ASSERT3U(fill_sum, ==, BP_GET_FILL(bp));
 	arc_buf_destroy(abuf, &abuf);
 
 	return (err);
 }
 
 /*
  * Print the block tree of dnode 'dn', starting from each block pointer
  * stored in the dnode itself.  Errors from visit_indirect() are ignored so
  * that the remaining top-level pointers are still attempted.
  */
 /*ARGSUSED*/
 static void
 dump_indirect(dnode_t *dn)
 {
 	dnode_phys_t *phys = dn->dn_phys;
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	zbookmark_phys_t top_zb;
 	int slot = 0;
 
 	(void) printf("Indirect blocks:\n");
 
 	/* The dnode's own block pointers sit at the topmost level. */
 	SET_BOOKMARK(&top_zb, dmu_objset_id(dn->dn_objset),
 	    dn->dn_object, phys->dn_nlevels - 1, 0);
 
 	while (slot < phys->dn_nblkptr) {
 		top_zb.zb_blkid = slot;
 		(void) visit_indirect(spa, phys, &phys->dn_blkptr[slot],
 		    &top_zb);
 		slot++;
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Object viewer for DSL directory bonus data: prints every field of the
  * dsl_dir_phys_t that 'data' points to.  Byte counts go through
  * zdb_nicenum(); object numbers are printed in decimal and the flags word
  * in hex.  A NULL 'data' prints nothing.
  */
 /*ARGSUSED*/
 static void
 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dsl_dir_phys_t *ddp = data;
 	char nbuf[32];
 	time_t created;
 
 	/* zdb_nicenum() output has to fit in nbuf */
 	CTASSERT(sizeof (nbuf) >= NN_NUMBUF_SZ);
 
 	if (ddp == NULL)
 		return;
 
 	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
 
 /* print "<label> = <decimal value>" */
 #define	PRINT_NUM(label, val) \
 	(void) printf("\t\t" label " = %llu\n", (u_longlong_t)(val))
 /* print "<label> = <human-readable value>" */
 #define	PRINT_NICE(label, val) \
 	do { \
 		zdb_nicenum((val), nbuf, sizeof (nbuf)); \
 		(void) printf("\t\t" label " = %s\n", nbuf); \
 	} while (0)
 
 	created = ddp->dd_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 	PRINT_NUM("head_dataset_obj", ddp->dd_head_dataset_obj);
 	PRINT_NUM("parent_dir_obj", ddp->dd_parent_obj);
 	PRINT_NUM("origin_obj", ddp->dd_origin_obj);
 	PRINT_NUM("child_dir_zapobj", ddp->dd_child_dir_zapobj);
 	PRINT_NICE("used_bytes", ddp->dd_used_bytes);
 	PRINT_NICE("compressed_bytes", ddp->dd_compressed_bytes);
 	PRINT_NICE("uncompressed_bytes", ddp->dd_uncompressed_bytes);
 	PRINT_NICE("quota", ddp->dd_quota);
 	PRINT_NICE("reserved", ddp->dd_reserved);
 	PRINT_NUM("props_zapobj", ddp->dd_props_zapobj);
 	PRINT_NUM("deleg_zapobj", ddp->dd_deleg_zapobj);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ddp->dd_flags);
 
 /* print one entry of the per-category space usage array */
 #define	PRINT_BREAKDOWN(which) \
 	PRINT_NICE("used_breakdown[" #which "]", \
 	    ddp->dd_used_breakdown[DD_USED_ ## which])
 
 	PRINT_BREAKDOWN(HEAD);
 	PRINT_BREAKDOWN(SNAP);
 	PRINT_BREAKDOWN(CHILD);
 	PRINT_BREAKDOWN(CHILD_RSRV);
 	PRINT_BREAKDOWN(REFRSRV);
 
 #undef PRINT_BREAKDOWN
 #undef PRINT_NICE
 #undef PRINT_NUM
 }
 
 /*
  * Object viewer for DSL dataset bonus data: prints every field of the
  * dsl_dataset_phys_t that 'data' points to, ending with the dataset's
  * block pointer.  A NULL 'data' prints nothing.
  */
 /*ARGSUSED*/
 static void
 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	dsl_dataset_phys_t *dsp = data;
 	char refd[32], comp[32], uncomp[32], uniq[32];
 	char bpstr[BP_SPRINTF_LEN];
 	time_t created;
 
 	/* zdb_nicenum() output has to fit in each of these */
 	CTASSERT(sizeof (refd) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (uniq) >= NN_NUMBUF_SZ);
 
 	if (dsp == NULL)
 		return;
 
 	ASSERT(size == sizeof (*dsp));
 
 	/* Format everything that needs a buffer before printing. */
 	created = dsp->ds_creation_time;
 	zdb_nicenum(dsp->ds_referenced_bytes, refd, sizeof (refd));
 	zdb_nicenum(dsp->ds_compressed_bytes, comp, sizeof (comp));
 	zdb_nicenum(dsp->ds_uncompressed_bytes, uncomp, sizeof (uncomp));
 	zdb_nicenum(dsp->ds_unique_bytes, uniq, sizeof (uniq));
 	snprintf_blkptr(bpstr, sizeof (bpstr), &dsp->ds_bp);
 
 /* print "<label> = <decimal value>" */
 #define	PRINT_NUM(label, val) \
 	(void) printf("\t\t" label " = %llu\n", (u_longlong_t)(val))
 /* print "<label> = <preformatted string>" */
 #define	PRINT_STR(label, str) \
 	(void) printf("\t\t" label " = %s\n", (str))
 
 	PRINT_NUM("dir_obj", dsp->ds_dir_obj);
 	PRINT_NUM("prev_snap_obj", dsp->ds_prev_snap_obj);
 	PRINT_NUM("prev_snap_txg", dsp->ds_prev_snap_txg);
 	PRINT_NUM("next_snap_obj", dsp->ds_next_snap_obj);
 	PRINT_NUM("snapnames_zapobj", dsp->ds_snapnames_zapobj);
 	PRINT_NUM("num_children", dsp->ds_num_children);
 	PRINT_NUM("userrefs_obj", dsp->ds_userrefs_obj);
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 	PRINT_NUM("creation_txg", dsp->ds_creation_txg);
 	PRINT_NUM("deadlist_obj", dsp->ds_deadlist_obj);
 	PRINT_STR("used_bytes", refd);
 	PRINT_STR("compressed_bytes", comp);
 	PRINT_STR("uncompressed_bytes", uncomp);
 	PRINT_STR("unique", uniq);
 	PRINT_NUM("fsid_guid", dsp->ds_fsid_guid);
 	PRINT_NUM("guid", dsp->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)dsp->ds_flags);
 	PRINT_NUM("next_clones_obj", dsp->ds_next_clones_obj);
 	PRINT_NUM("props_obj", dsp->ds_props_obj);
 	PRINT_STR("bp", bpstr);
 
 #undef PRINT_STR
 #undef PRINT_NUM
 }
 
 /*
  * bptree_iterate() callback used by dump_bptree(): prints the full block
  * pointer description of every entry whose birth txg is non-zero.  Always
  * returns 0 so the iteration continues.
  */
 /* ARGSUSED */
 static int
 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	char desc[BP_SPRINTF_LEN];
 
 	if (bp->blk_birth == 0)
 		return (0);
 
 	snprintf_blkptr(desc, sizeof (desc), bp);
 	(void) printf("\t%s\n", desc);
 
 	return (0);
 }
 
 /*
  * Summarize the bptree stored in object 'obj' of 'os' under the heading
  * 'name'.  Nothing is printed below -ddd; from -ddddd on, the individual
  * block pointers are listed as well.
  */
 static void
 dump_bptree(objset_t *os, uint64_t obj, char *name)
 {
 	char nicebytes[32];
 	unsigned long long ndatasets;
 	bptree_phys_t *btp;
 	dmu_buf_t *bonus;
 
 	/* zdb_nicenum() output has to fit in nicebytes */
 	CTASSERT(sizeof (nicebytes) >= NN_NUMBUF_SZ);
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	/* The bptree header lives in the object's bonus buffer. */
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &bonus));
 	btp = bonus->db_data;
 	zdb_nicenum(btp->bt_bytes, nicebytes, sizeof (nicebytes));
 	ndatasets = (unsigned long long)(btp->bt_end - btp->bt_begin);
 	(void) printf("\n    %s: %llu datasets, %s\n",
 	    name, ndatasets, nicebytes);
 	dmu_buf_rele(bonus, FTAG);
 
 	if (dump_opt['d'] >= 5) {
 		(void) printf("\n");
 
 		(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb,
 		    NULL, NULL);
 	}
 }
 
 /*
  * bpobj iteration callback used by dump_full_bpobj(): prints the compact
  * description of one block pointer, which must have a non-zero birth txg.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	char desc[BP_SPRINTF_LEN];
 
 	ASSERT(bp->blk_birth != 0);
 
 	snprintf_blkptr_compact(desc, sizeof (desc), bp);
 	(void) printf("\t%s\n", desc);
 
 	return (0);
 }
 
 /*
  * Print a summary of block pointer object 'bpo' labelled 'name'.  When the
  * bpobj has sub-objects, each one that can be opened is summarized
  * recursively with 'indent' increased by one.  From -ddddd on, the block
  * pointers are listed too, but only by the outermost (indent == 0) call.
  * Nothing is printed below -ddd.
  */
 static void
 dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
 {
 	char nicebytes[32];
 	char nicecomp[32];
 	char niceuncomp[32];
 	boolean_t has_subobjs;
 
 	/* zdb_nicenum() output has to fit in each of these */
 	CTASSERT(sizeof (nicebytes) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (nicecomp) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (niceuncomp) >= NN_NUMBUF_SZ);
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	zdb_nicenum(bpo->bpo_phys->bpo_bytes, nicebytes, sizeof (nicebytes));
 
 	has_subobjs = (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0);
 	if (!has_subobjs) {
 		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
 		    indent * 8, name,
 		    (u_longlong_t)bpo->bpo_object,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 		    nicebytes);
 	} else {
 		zdb_nicenum(bpo->bpo_phys->bpo_comp, nicecomp,
 		    sizeof (nicecomp));
 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, niceuncomp,
 		    sizeof (niceuncomp));
 		(void) printf("    %*s: object %llu, %llu local blkptrs, "
 		    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
 		    indent * 8, name,
 		    (u_longlong_t)bpo->bpo_object,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 		    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 		    nicebytes, nicecomp, niceuncomp);
 
 		/*
 		 * The sub-object ids are stored as an array of uint64_t in
 		 * the object named by bpo_subobjs; read them one at a time.
 		 */
 		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
 			uint64_t child_id;
 			bpobj_t child;
 			int error;
 
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    i * sizeof (child_id), sizeof (child_id),
 			    &child_id, 0));
 			error = bpobj_open(&child, bpo->bpo_os, child_id);
 			if (error == 0) {
 				dump_full_bpobj(&child, "subobj", indent + 1);
 				bpobj_close(&child);
 			} else {
 				/* Report it and move on to the next one. */
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    error, (u_longlong_t)child_id);
 			}
 		}
 	}
 
 	if (dump_opt['d'] >= 5 && indent == 0) {
 		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print the deadlist 'dl'.
  *
  * Nothing is printed below -ddd.  Old-format deadlists are a single bpobj
  * and are handed to dump_full_bpobj().  Otherwise a one-line space summary
  * is printed; from -dddd on, each (mintxg -> bpobj) entry is listed, and
  * from -ddddd on, each entry's bpobj is dumped in full.
  */
 static void
 dump_deadlist(dsl_deadlist_t *dl)
 {
 	dsl_deadlist_entry_t *dle;
 	uint64_t unused;
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 
 	/* make sure nicenum has enough space */
 	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	if (dl->dl_oldfmt) {
 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
 		return;
 	}
 
 	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
 	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
 	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
 	    bytes, comp, uncomp);
 
 	if (dump_opt['d'] < 4)
 		return;
 
 	(void) printf("\n");
 
 	/* force the tree to be loaded */
 	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
 
 	for (dle = avl_first(&dl->dl_tree); dle;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		/* %llu takes an unsigned argument, so cast accordingly. */
 		if (dump_opt['d'] >= 5) {
 			char buf[128];
 			(void) snprintf(buf, sizeof (buf),
 			    "mintxg %llu -> obj %llu",
 			    (u_longlong_t)dle->dle_mintxg,
 			    (u_longlong_t)dle->dle_bpobj.bpo_object);
 			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
 		} else {
 			(void) printf("mintxg %llu -> obj %llu\n",
 			    (u_longlong_t)dle->dle_mintxg,
 			    (u_longlong_t)dle->dle_bpobj.bpo_object);
 		}
 	}
 }
 
 /*
  * FUID domain lookup trees.  dump_uidgid() creates and loads them the first
  * time a uid/gid with a FUID index is seen; fuid_table_destroy() frees
  * them.  fuid_table_loaded records whether they are currently populated.
  */
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
 /*
  * The objset currently opened through open_objset() and the attribute
  * table that sa_setup() produced for it.  Both are reset to NULL by
  * close_objset().
  */
 static objset_t *sa_os = NULL;
 static sa_attr_type_t *sa_attr_table = NULL;
 
 /*
  * Take ownership of the dataset named 'path' and, for ZPL objsets, set up
  * system-attribute (SA) support so that znodes can be decoded.
  *
  * On success returns 0, stores the objset in '*osp' and records it in the
  * file-scope 'sa_os'; the caller must release it with close_objset().
  *
  * On failure an error is printed to stderr and the errno value is
  * returned.  If SA setup fails, the objset is disowned again, '*osp' is set
  * to NULL and 'sa_os' is left NULL, so the caller has nothing to close.
  *
  * Only one objset may be open through this function at a time.
  */
 static int
 open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp)
 {
 	int err;
 	uint64_t sa_attrs = 0;
 	uint64_t version = 0;
 
 	VERIFY3P(sa_os, ==, NULL);
 	err = dmu_objset_own(path, type, B_TRUE, tag, osp);
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
 		    strerror(err));
 		return (err);
 	}
 
 	if (dmu_objset_type(*osp) == DMU_OST_ZFS) {
 		/*
 		 * Lookup failures are tolerated here: 'version' and
 		 * 'sa_attrs' then simply keep their zero defaults.
 		 */
 		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 		    8, 1, &version);
 		if (version >= ZPL_VERSION_SA) {
 			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
 			    8, 1, &sa_attrs);
 		}
 		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
 		    &sa_attr_table);
 		if (err != 0) {
 			(void) fprintf(stderr, "sa_setup failed: %s\n",
 			    strerror(err));
 			dmu_objset_disown(*osp, tag);
 			*osp = NULL;
 			/*
 			 * Report the failure instead of claiming success
 			 * with a NULL objset; sa_os is still NULL here.
 			 */
 			return (err);
 		}
 	}
 	sa_os = *osp;
 
 	return (0);
 }
 
 /*
  * Release the objset obtained from open_objset().  'os' must be the objset
  * currently recorded in 'sa_os'; afterwards the file-scope SA state is
  * cleared so another objset can be opened.
  */
 static void
 close_objset(objset_t *os, void *tag)
 {
 	VERIFY3P(os, ==, sa_os);
 
 	/* Tear down SA state only if this objset has any. */
 	if (os->os_sa != NULL) {
 		sa_tear_down(os);
 	}
 	dmu_objset_disown(os, tag);
 
 	sa_os = NULL;
 	sa_attr_table = NULL;
 }
 
 /*
  * Free the FUID lookup trees if dump_uidgid() has loaded them, and mark
  * them as unloaded.  Does nothing otherwise.
  */
 static void
 fuid_table_destroy()
 {
 	if (!fuid_table_loaded)
 		return;
 
 	zfs_fuid_table_destroy(&idx_tree, &domain_tree);
 	fuid_table_loaded = B_FALSE;
 }
 
 /*
  * Print one uid or gid line; 'id_type' is the label ("uid" or "gid").
  * An id without a FUID index is a plain POSIX id and is shown in decimal.
  * An id with a FUID index is shown in hex, followed by the domain string
  * found for that index and the rid, as "[domain-rid]".
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
 {
 	char *domain;
 
 	if (FUID_INDEX(id) == 0) {
 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
 		return;
 	}
 
 	domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
 	(void) printf("\t%s     %llx [%s-%d]\n", id_type,
 	    (u_longlong_t)id, domain, (int)FUID_RID(id));
 }
 
 /*
  * Print the owner and group of a file.  If either id carries a FUID index
  * and the FUID tables have not been loaded yet, they are loaded from 'os'
  * first so that print_idstr() can resolve the domain.
  */
 static void
 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	uint32_t uid_idx = FUID_INDEX(uid);
 	uint32_t gid_idx = FUID_INDEX(gid);
 	boolean_t need_table = (uid_idx != 0 || gid_idx != 0);
 
 	/* Load domain table, if not already loaded */
 	if (need_table && !fuid_table_loaded) {
 		uint64_t fuid_obj;
 
 		/* The FUID table object is named in the master node. */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &fuid_obj) == 0);
 
 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		/* The load result is ignored; the table counts as loaded. */
 		(void) zfs_fuid_table_load(os, fuid_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
 	}
 
 	print_idstr(uid, "uid");
 	print_idstr(gid, "gid");
 }
 
 /*
  * Object viewer for znode / SA bonus data: looks up the file's attributes
  * through the SA layer and prints them.  'os' must be the objset opened
  * with open_objset().  From -ddddd on, the file's path is printed too, or
  * a "???<object#N>" placeholder when it cannot be resolved.  The xattr
  * and rdev lines are printed only when those attributes exist.
  *
  * If the SA handle or the bulk lookup cannot be obtained, the function
  * returns early (the former with a message, the latter silently).
  */
 /*ARGSUSED*/
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
 	sa_handle_t *sa_hdl;
 	sa_bulk_attr_t bulk[12];
 	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
 	uint64_t uid, gid, mode, fsize, parent, links;
 	uint64_t xattr, rdev, gen, pflags;
 	time_t t_access, t_modify, t_change, t_create;
 	int nattrs = 0;
 	int err;
 
 	VERIFY3P(os, ==, sa_os);
 	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &sa_hdl) != 0) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return;
 	}
 
 	/* Queue all mandatory attributes for one bulk lookup. */
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_UID], NULL,
 	    &uid, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_GID], NULL,
 	    &gid, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_LINKS], NULL,
 	    &links, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_GEN], NULL,
 	    &gen, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_MODE], NULL,
 	    &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_PARENT], NULL,
 	    &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_SIZE], NULL,
 	    &fsize, 8);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_ATIME], NULL,
 	    acctm, 16);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_MTIME], NULL,
 	    modtm, 16);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_CRTIME], NULL,
 	    crtm, 16);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_CTIME], NULL,
 	    chgtm, 16);
 	SA_ADD_BULK_ATTR(bulk, nattrs, sa_attr_table[ZPL_FLAGS], NULL,
 	    &pflags, 8);
 
 	if (sa_bulk_lookup(sa_hdl, bulk, nattrs) != 0) {
 		(void) sa_handle_destroy(sa_hdl);
 		return;
 	}
 
 	/* Only the first word of each timestamp pair is printed. */
 	t_access = (time_t)acctm[0];
 	t_modify = (time_t)modtm[0];
 	t_change = (time_t)chgtm[0];
 	t_create = (time_t)crtm[0];
 
 	if (dump_opt['d'] > 4) {
 		err = zfs_obj_to_path(os, object, path, sizeof (path));
 		if (err != 0) {
 			(void) snprintf(path, sizeof (path),
 			    "\?\?\?<object#%llu>", (u_longlong_t)object);
 		}
 		(void) printf("\tpath	%s\n", path);
 	}
 
 	dump_uidgid(os, uid, gid);
 
 	(void) printf("\tatime	%s", ctime(&t_access));
 	(void) printf("\tmtime	%s", ctime(&t_modify));
 	(void) printf("\tctime	%s", ctime(&t_change));
 	(void) printf("\tcrtime	%s", ctime(&t_create));
 	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
 	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
 	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
 	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
 	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
 	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
 
 	/* Optional attributes: print them only when the lookup succeeds. */
 	if (sa_lookup(sa_hdl, sa_attr_table[ZPL_XATTR], &xattr,
 	    sizeof (uint64_t)) == 0) {
 		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
 	}
 	if (sa_lookup(sa_hdl, sa_attr_table[ZPL_RDEV], &rdev,
 	    sizeof (uint64_t)) == 0) {
 		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
 	}
 
 	sa_handle_destroy(sa_hdl);
 }
 
 /*
  * Object viewer installed in the ACL slots of object_viewer[].  It takes
  * the common viewer arguments but deliberately prints nothing.
  */
 /*ARGSUSED*/
 static void
 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Object viewer installed in the "DMU objset" slot of object_viewer[].
  * It takes the common viewer arguments but deliberately prints nothing.
  */
 /*ARGSUSED*/
 static void
 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
 /*
  * Per-object-type viewer table.  dump_object() indexes it with
  * ZDB_OT_TYPE(type) and calls the entry as viewer(os, object, data, size),
  * once for the bonus type (with the bonus buffer) and once for the object
  * type (with NULL data).  Each entry's comment names the object type that
  * slot stands for; the extra final slot handles unknown types.
  */
 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 	dump_none,		/* unallocated			*/
 	dump_zap,		/* object directory		*/
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
 	dump_none,		/* bpobj			*/
 	dump_bpobj,		/* bpobj header			*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
 	dump_dnode,		/* DMU dnode			*/
 	dump_dmu_objset,	/* DMU objset			*/
 	dump_dsl_dir,		/* DSL directory		*/
 	dump_zap,		/* DSL directory child map	*/
 	dump_zap,		/* DSL dataset snap map		*/
 	dump_zap,		/* DSL props			*/
 	dump_dsl_dataset,	/* DSL dataset			*/
 	dump_znode,		/* ZFS znode			*/
 	dump_acl,		/* ZFS V0 ACL			*/
 	dump_uint8,		/* ZFS plain file		*/
 	dump_zpldir,		/* ZFS directory		*/
 	dump_zap,		/* ZFS master node		*/
 	dump_zap,		/* ZFS delete queue		*/
 	dump_uint8,		/* zvol object			*/
 	dump_zap,		/* zvol prop			*/
 	dump_uint8,		/* other uint8[]		*/
 	dump_uint64,		/* other uint64[]		*/
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
 	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
 	dump_zap,		/* ZFS user/group used		*/
 	dump_zap,		/* ZFS user/group quota		*/
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
 	dump_znode,		/* SA object			*/
 	dump_zap,		/* SA Master Node		*/
 	dump_sa_attrs,		/* SA attribute registration	*/
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
 	dump_zap,		/* deadlist			*/
 	dump_none,		/* deadlist hdr			*/
 	dump_zap,		/* dsl clones			*/
 	dump_bpobj_subobjs,	/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 /*
  * Print one object of 'os'.
  *
  * A column header is printed first when '*print_header' is non-zero, and
  * the flag is then cleared; it is set again after detailed output has been
  * printed so that the next object's summary line gets a fresh header.
  *
  * The summary line is always printed.  Higher 'verbosity' adds:
  *   > 3   the bonus buffer type and size (when the object has a bonus)
  *   >= 4  dnode flags, max block id and the type-specific viewers
  *   >= 5  the indirect block tree and the list of allocated segments
  *   >= 6  checksum and compression are shown even when inherited
  *
  * Object 0 denotes the meta dnode.  Failure to hold any other object's
  * bonus buffer is fatal.
  */
 static void
 dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
 {
 	dmu_buf_t *db = NULL;
 	dmu_object_info_t doi;
 	dnode_t *dn;
 	void *bonus = NULL;
 	size_t bsize = 0;
 	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
 	char bonus_size[32];
 	char aux[50];
 	int error;
 
 	/* make sure nicenum has enough space */
 	CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
 	CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
 
 	if (*print_header) {
 		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
 		    "%full", "type");
 		*print_header = 0;
 	}
 
 	if (object == 0) {
 		dn = DMU_META_DNODE(os);
 	} else {
 		error = dmu_bonus_hold(os, object, FTAG, &db);
 		if (error)
 			fatal("dmu_bonus_hold(%llu) failed, errno %u",
 			    (u_longlong_t)object, error);
 		bonus = db->db_data;
 		bsize = db->db_size;
 		dn = DB_DNODE((dmu_buf_impl_t *)db);
 	}
 	dmu_object_info_from_dnode(dn, &doi);
 
 	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
 	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
 	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
 	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
 	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
 	/*
 	 * Percentage of the object's logical extent that is filled.  For
 	 * the meta dnode the fill count is in dnodes, hence the division
 	 * by DNODES_PER_BLOCK.
 	 */
 	(void) snprintf(fill, sizeof (fill), "%6.2f",
 	    100.0 * doi.doi_fill_count *
 	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
 	    doi.doi_max_offset);
 
 	aux[0] = '\0';
 
 	/* Each append is bounded by the space still left in aux. */
 	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
 	}
 
 	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
 	(void) printf("%10llu  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
 	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
 	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "", "", "", "", "", bonus_size, "bonus",
 		    ZDB_OT_NAME(doi.doi_bonus_type));
 	}
 
 	if (verbosity >= 4) {
 		(void) printf("\tdnode flags: %s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
 		    "USERUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
 		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (u_longlong_t)dn->dn_phys->dn_maxblkid);
 
 		/* Bonus viewer gets the bonus buffer, data viewer gets none. */
 		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
 		    bonus, bsize);
 		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
 		*print_header = 1;
 	}
 
 	if (verbosity >= 5)
 		dump_indirect(dn);
 
 	if (verbosity >= 5) {
 		/*
 		 * Report the list of segments that comprise the object.
 		 */
 		uint64_t start = 0;
 		uint64_t end;
 		uint64_t blkfill = 1;
 		int minlvl = 1;
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			minlvl = 0;
 			blkfill = DNODES_PER_BLOCK;
 		}
 
 		for (;;) {
 			char segsize[32];
 			/* make sure nicenum has enough space */
 			CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
 			/* Find the next data, then the hole that ends it. */
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			zdb_nicenum(end - start, segsize, sizeof (segsize));
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
 			if (error)
 				break;
 			start = end;
 		}
 	}
 
 	if (db != NULL)
 		dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Display names for objset types; dump_dir() indexes this array with the
  * dds_type reported for the objset.
  */
 static char *objset_types[DMU_OST_NUMTYPES] = {
 	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
 
 /*
  * Print a one-line summary of objset 'os' and then, depending on the -d
  * verbosity and the objects named on the command line, its contents.
  *
  * If specific objects were requested (zopt_objects != 0) only those are
  * dumped.  Otherwise the intent log (with -i or -dd and up) and deadlist
  * are printed, and from -dd on every object in the objset is dumped.
  * Aborts if the object walk ends with anything other than ESRCH.
  */
 static void
 dump_dir(objset_t *os)
 {
 	dmu_objset_stats_t stats;
 	uint64_t obj, nobjs;
 	uint64_t refdbytes, usedobjs, scratch;
 	char nicebytes[32];
 	char rootbp_str[BP_SPRINTF_LEN + 20];
 	char osname[ZFS_MAX_DATASET_NAME_LEN];
 	char *type_name = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	int print_header = 1;
 	int error;
 
 	/* zdb_nicenum() output has to fit in nicebytes */
 	CTASSERT(sizeof (nicebytes) >= NN_NUMBUF_SZ);
 
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &stats);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	if (stats.dds_type < DMU_OST_NUMTYPES)
 		type_name = objset_types[stats.dds_type];
 
 	/* The meta objset takes its numbers from the pool itself. */
 	if (stats.dds_type != DMU_OST_META) {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs,
 		    &scratch);
 	} else {
 		stats.dds_creation_txg = TXG_INITIAL;
 		usedobjs = BP_GET_FILL(os->os_rootbp);
 		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
 		    dd_used_bytes;
 	}
 
 	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
 	zdb_nicenum(refdbytes, nicebytes, sizeof (nicebytes));
 
 	/* The root block pointer is appended only from -dddd on. */
 	rootbp_str[0] = '\0';
 	if (verbosity >= 4) {
 		(void) snprintf(rootbp_str, sizeof (rootbp_str), ", rootbp ");
 		(void) snprintf_blkptr(rootbp_str + strlen(rootbp_str),
 		    sizeof (rootbp_str) - strlen(rootbp_str), os->os_rootbp);
 	}
 
 	dmu_objset_name(os, osname);
 
 	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
 	    "%s, %llu objects%s\n",
 	    osname, type_name, (u_longlong_t)dmu_objset_id(os),
 	    (u_longlong_t)stats.dds_creation_txg,
 	    nicebytes, (u_longlong_t)usedobjs, rootbp_str);
 
 	/* Explicitly requested objects: dump just those and stop. */
 	if (zopt_objects != 0) {
 		for (int i = 0; i < zopt_objects; i++) {
 			dump_object(os, zopt_object[i], verbosity,
 			    &print_header);
 		}
 		(void) printf("\n");
 		return;
 	}
 
 	if (dump_opt['i'] != 0 || verbosity >= 2)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
 
 	if (verbosity < 2 || BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	/* The meta dnode and the accounting objects come first. */
 	dump_object(os, 0, verbosity, &print_header);
 	if (DMU_USERUSED_DNODE(os) != NULL &&
 	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
 	}
 
 	/* Walk every allocated object, counting as we go. */
 	nobjs = 0;
 	for (obj = 0;
 	    (error = dmu_object_next(os, &obj, B_FALSE, 0)) == 0;
 	    nobjs++) {
 		dump_object(os, obj, verbosity, &print_header);
 	}
 
 	ASSERT3U(nobjs, ==, usedobjs);
 
 	(void) printf("\n");
 
 	/* ESRCH is the only way the walk is expected to end. */
 	if (error != ESRCH) {
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
 }
 
 /*
  * Print the fields of uberblock 'ub', preceded by 'header' and followed by
  * 'footer'; either may be NULL to print nothing.  Both are printed
  * verbatim, never interpreted as printf formats.  The root block pointer
  * is included when dump_opt['u'] >= 3.
  */
 static void
 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
 	time_t timestamp = ub->ub_timestamp;
 
 	/* Caller-supplied text must not be used as the format string. */
 	(void) printf("%s", header ? header : "");
 	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
 	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
 	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
 	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
 	if (dump_opt['u'] >= 3) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
 	(void) printf("%s", footer ? footer : "");
 }
 
 /*
  * Print the pool configuration stored in the MOS.
  *
  * The config object's bonus buffer holds the size of the packed nvlist as
  * a uint64_t.  That size is copied out and handed to dump_packed_nvlist()
  * as the viewer's bonus data.  If the bonus buffer cannot be held, an error
  * is printed to stderr instead.
  */
 static void
 dump_config(spa_t *spa)
 {
 	dmu_buf_t *db;
 	/*
 	 * Kept as uint64_t, the type it is read as from the bonus buffer:
 	 * its address is passed on as raw viewer data, so a narrower size_t
 	 * would be too small an object wherever size_t is 32 bits.
 	 * NOTE(review): assumes dump_packed_nvlist() reads it as uint64_t.
 	 */
 	uint64_t nvsize = 0;
 	int error = 0;
 
 
 	error = dmu_bonus_hold(spa->spa_meta_objset,
 	    spa->spa_config_object, FTAG, &db);
 
 	if (error == 0) {
 		nvsize = *(uint64_t *)db->db_data;
 		dmu_buf_rele(db, FTAG);
 
 		(void) printf("\nMOS Configuration:\n");
 		dump_packed_nvlist(spa->spa_meta_objset,
 		    spa->spa_config_object, (void *)&nvsize, 1);
 	} else {
 		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d\n",
 		    (u_longlong_t)spa->spa_config_object, error);
 	}
 }
 
 static void
 dump_cachefile(const char *cachefile)
 {
 	int fd;
 	struct stat64 statbuf;
 	char *buf;
 	nvlist_t *config;
 
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
 		(void) fprintf(stderr, "cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if ((buf = malloc(statbuf.st_size)) == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
 		exit(1);
 	}
 
 	free(buf);
 
 	dump_nvlist(config, 0);
 
 	nvlist_free(config);
 }
 
 #define	ZDB_MAX_UB_HEADER_SIZE 32
 
 /*
  * Print every uberblock in label 'lbl' that passes uberblock_verify(),
  * each under an "Uberblock[<index>]" heading.  'ashift' is stored in a
  * stack vdev_t that is passed to the VDEV_UBERBLOCK_* macros to locate the
  * uberblock slots.
  */
 static void
 dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
 {
 	char heading[ZDB_MAX_UB_HEADER_SIZE];
 	vdev_t fake_vd;
 	vdev_t *vdp = &fake_vd;
 
 	/*
 	 * Only these two fields of the stack vdev are initialized.
 	 * NOTE(review): presumably all the VDEV_UBERBLOCK_* macros read.
 	 */
 	fake_vd.vdev_ashift = ashift;
 	vdp->vdev_top = vdp;
 
 	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
 		uint64_t slot_off = VDEV_UBERBLOCK_OFFSET(vdp, i);
 		uberblock_t *ub = (void *)((char *)lbl + slot_off);
 
 		/* Slots that fail verification are skipped silently. */
 		if (uberblock_verify(ub) == 0) {
 			(void) snprintf(heading, sizeof (heading),
 			    "Uberblock[%d]\n", i);
 			dump_uberblock(ub, heading, "");
 		}
 	}
 }
 
 /*
  * Human-readable description of the path walked so far; dump_path_impl()
  * appends each component to it and uses it in its messages.
  */
 static char curpath[PATH_MAX];
 
 /*
  * Iterate through the path components, recursively passing
  * current one's obj and remaining path until we find the obj
  * for the last one.
  *
  * os   - objset in which the lookups are performed.
  * obj  - object number of the directory in which the first component of
  *        name is looked up.
  * name - remaining path; modified in place (the first '/' is overwritten
  *        with a NUL to isolate the current component).
  *
  * Returns 0 after dumping the final object, the zap_lookup() error if a
  * component cannot be found, and EINVAL for every other failure.
  */
 static int
 dump_path_impl(objset_t *os, uint64_t obj, char *name)
 {
 	int err;
 	int header = 1;
 	uint64_t child_obj;
 	char *s;
 	dmu_buf_t *db;
 	dmu_object_info_t doi;
 
 	/* Cut the path at the first separator; s remembers where it was. */
 	if ((s = strchr(name, '/')) != NULL)
 		*s = '\0';
 	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
 
 	/* Appended before the error check so the message names this part. */
 	(void) strlcat(curpath, name, sizeof (curpath));
 
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to lookup %s: %s\n",
 		    curpath, strerror(err));
 		return (err);
 	}
 
 	child_obj = ZFS_DIRENT_OBJ(child_obj);
 	err = sa_buf_hold(os, child_obj, FTAG, &db);
 	if (err != 0) {
 		(void) fprintf(stderr,
 		    "failed to get SA dbuf for obj %llu: %s\n",
 		    (u_longlong_t)child_obj, strerror(err));
 		return (EINVAL);
 	}
 	dmu_object_info_from_db(db, &doi);
 	sa_buf_rele(db, FTAG);
 
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    doi.doi_bonus_type != DMU_OT_ZNODE) {
 		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
 		    doi.doi_bonus_type, (u_longlong_t)child_obj);
 		return (EINVAL);
 	}
 
 	if (dump_opt['v'] > 6) {
 		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
 		    (u_longlong_t)child_obj, curpath, doi.doi_type,
 		    doi.doi_bonus_type);
 	}
 
 	(void) strlcat(curpath, "/", sizeof (curpath));
 
 	switch (doi.doi_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		/* More components follow: descend into this directory. */
 		if (s != NULL && *(s + 1) != '\0')
 			return (dump_path_impl(os, child_obj, s + 1));
 		/*FALLTHROUGH*/
 	case DMU_OT_PLAIN_FILE_CONTENTS:
 		dump_object(os, child_obj, dump_opt['v'], &header);
 		return (0);
 	default:
 		/*
 		 * doi describes child_obj, so that is the object number to
 		 * report alongside its type (not the parent directory).
 		 */
 		(void) fprintf(stderr, "object %llu has non-file/directory "
 		    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
 		break;
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Dump the blocks for the object specified by path inside the dataset.
  *
  * ds   - name of the dataset to open (as a DMU_OST_ZFS objset).
  * path - path of the object relative to the dataset root; passed to
  *        dump_path_impl(), which modifies it in place.
  *
  * Returns the open_objset() error if the dataset cannot be opened, EINVAL
  * if the root znode cannot be looked up, and otherwise the result of
  * dump_path_impl().
  */
 static int
 dump_path(char *ds, char *path)
 {
 	int err;
 	objset_t *os;
 	uint64_t root_obj;
 
 	err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
 	if (err != 0)
 		return (err);
 
 	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
 	if (err != 0) {
 		(void) fprintf(stderr, "can't lookup root znode: %s\n",
 		    strerror(err));
 		/*
 		 * Release through close_objset(), the counterpart of
 		 * open_objset() used on the success path below, instead of
 		 * disowning the objset directly.
 		 */
 		close_objset(os, FTAG);
 		return (EINVAL);
 	}
 
 	/* Seed the path description that dump_path_impl() extends. */
 	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
 
 	err = dump_path_impl(os, root_obj, path);
 
 	close_objset(os, FTAG);
 	return (err);
 }
 
 /*
  * Read and print the VDEV_LABELS labels of a device.
  *
  * dev - device name or path.  An absolute path that starts with
  *       ZFS_DISK_ROOTD is rewritten to live under ZFS_RDISK_ROOTD; a
  *       relative name that cannot be stat'ed is prefixed with
  *       ZFS_RDISK_ROOTD and, unless it already appears to end in a
  *       slice/partition number, suffixed with "s0".
  *
  * Failing to open or stat the device, or finding a block device, is fatal
  * (exit status 1).  Returns 0 if at least one label could be unpacked and
  * 2 otherwise.
  */
 static int
 dump_label(const char *dev)
 {
 	int fd;
 	vdev_label_t label;
 	char path[MAXPATHLEN];
 	char *buf = label.vl_vdev_phys.vp_nvlist;
 	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
 	struct stat64 statbuf;
 	uint64_t psize, ashift;
 	boolean_t label_found = B_FALSE;
 
 	(void) strlcpy(path, dev, sizeof (path));
 	if (dev[0] == '/') {
 		if (strncmp(dev, ZFS_DISK_ROOTD,
 		    strlen(ZFS_DISK_ROOTD)) == 0) {
 			(void) snprintf(path, sizeof (path), "%s%s",
 			    ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
 		}
 	} else if (stat64(path, &statbuf) != 0) {
 		char *s;
 
 		(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
 		    dev);
 		/*
 		 * Append "s0" unless the name has an 's' (last occurrence)
 		 * or, failing that, a 'p' (first occurrence) followed by a
 		 * digit.  The character is converted to unsigned char
 		 * because passing a negative char to isdigit() is undefined.
 		 */
 		if (((s = strrchr(dev, 's')) == NULL &&
 		    (s = strchr(dev, 'p')) == NULL) ||
 		    !isdigit((unsigned char)*(s + 1)))
 			(void) strlcat(path, "s0", sizeof (path));
 	}
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) fprintf(stderr, "cannot open '%s': %s\n", path,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) fprintf(stderr, "failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		(void) close(fd);
 		exit(1);
 	}
 
 	if (S_ISBLK(statbuf.st_mode)) {
 		(void) fprintf(stderr,
 		    "cannot use '%s': character device required\n", path);
 		(void) close(fd);
 		exit(1);
 	}
 
 	/* Round the device size down to a whole number of labels. */
 	psize = statbuf.st_size;
 	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		nvlist_t *config = NULL;
 
 		if (!dump_opt['q']) {
 			(void) printf("------------------------------------\n");
 			(void) printf("LABEL %d\n", l);
 			(void) printf("------------------------------------\n");
 		}
 
 		if (pread64(fd, &label, sizeof (label),
 		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
 			if (!dump_opt['q'])
 				(void) printf("failed to read label %d\n", l);
 			continue;
 		}
 
 		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
 			if (!dump_opt['q'])
 				(void) printf("failed to unpack label %d\n", l);
 			ashift = SPA_MINBLOCKSHIFT;
 		} else {
 			nvlist_t *vdev_tree = NULL;
 
 			if (!dump_opt['q'])
 				dump_nvlist(config, 4);
 			/* Fall back to the minimum shift if none is stored. */
 			if ((nvlist_lookup_nvlist(config,
 			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
 			    (nvlist_lookup_uint64(vdev_tree,
 			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
 				ashift = SPA_MINBLOCKSHIFT;
 			nvlist_free(config);
 			label_found = B_TRUE;
 		}
 		/* Uberblocks are dumped even when the nvlist did not unpack. */
 		if (dump_opt['u'])
 			dump_label_uberblocks(&label, ashift);
 	}
 
 	(void) close(fd);
 
 	return (label_found ? 0 : 2);
 }
 
 /* Per-feature count of visited datasets that have the feature in use. */
 static uint64_t dataset_feature_count[SPA_FEATURES];
 
 /*
  * Per-dataset callback: open the named dataset, tally the features it has
  * in use into dataset_feature_count[], and dump it with dump_dir().
  *
  * Always returns 0; a dataset that cannot be opened is silently skipped.
  */
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
 {
 	objset_t *os;
 	spa_feature_t feat;
 
 	if (open_objset(dsname, DMU_OST_ANY, FTAG, &os) != 0)
 		return (0);
 
 	for (feat = 0; feat < SPA_FEATURES; feat++) {
 		if (dmu_objset_ds(os)->ds_feature_inuse[feat]) {
 			ASSERT(spa_feature_table[feat].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET);
 			dataset_feature_count[feat]++;
 		}
 	}
 
 	dump_dir(os);
 	close_objset(os, FTAG);
 	fuid_table_destroy();
 
 	return (0);
 }
 
 /*
  * Block statistics.
  *
  * One zdb_blkstats_t accumulates totals for a set of block pointers; the
  * counters are updated by zdb_count_block().
  */
 /*
  * Number of psize histogram buckets.  zdb_count_block() indexes the
  * histogram by psize in SPA_MINBLOCKSIZE units and clamps anything larger
  * than SPA_OLD_MAXBLOCKSIZE into the last bucket.
  */
 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;		/* sum of BP_GET_ASIZE() */
 	uint64_t zb_lsize;		/* sum of BP_GET_LSIZE() */
 	uint64_t zb_psize;		/* sum of BP_GET_PSIZE() */
 	uint64_t zb_count;		/* number of block pointers counted */
 	uint64_t zb_gangs;		/* sum of BP_COUNT_GANG() */
 	/* bps with 2 or 3 DVAs where at least two share a vdev */
 	uint64_t zb_ditto_samevdev;
 	/* block counts bucketed by psize in SPA_MINBLOCKSIZE units */
 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
  * Extended object types to report deferred frees and dedup auto-ditto blocks.
  * They are numbered directly after the last DMU object type so they can be
  * used as additional indices next to the regular types; ZDB_OT_TOTAL is the
  * highest index.
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 /*
  * Display names for the extended types, indexed by (type - DMU_OT_NUMTYPES);
  * the order must match the ZDB_OT_* definitions above.
  */
 static char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
 	"other",
 	"Total",
 };
 
 /* Extra level index, one past the real levels, that holds per-type totals. */
 #define	ZB_TOTAL	DN_MAX_LEVELS
 
 /*
  * State shared by the block-traversal callbacks (zdb_blkptr_cb(),
  * zdb_blkptr_done(), count_block_cb()) and reported by dump_block_stats().
  */
 typedef struct zdb_cb {
 	/* statistics indexed by [level][object type], incl. both totals */
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	/* accumulated by zdb_ddt_leak_init(): asize * (refcnt - 1) */
 	uint64_t	zcb_dedup_asize;
 	/* incremented by zdb_ddt_leak_init() per non-ditto DDT phys entry */
 	uint64_t	zcb_dedup_blocks;
 	/* embedded block pointers seen, per embedded type */
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
 	/* embedded block pointers per embedded type and BPE_GET_PSIZE() */
 	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
 	    [BPE_PAYLOAD_SIZE];
 	uint64_t	zcb_start;	/* gethrtime() when traversal began */
 	uint64_t	zcb_lastprint;	/* gethrtime() of last progress line */
 	/* allocated size of the normal class, used for the time estimate */
 	uint64_t	zcb_totalasize;
 	/* read-error counts, indexed by the zio error number */
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;	/* reset to 0 by zdb_blkptr_cb() */
 	int		zcb_haderrors;	/* non-zero once any error was seen */
 	spa_t		*zcb_spa;	/* pool being traversed */
 } zdb_cb_t;
 
 /*
  * Account for one block pointer in the traversal statistics and, unless
  * leak detection is disabled with -L, claim it.
  *
  * zcb   - traversal state to update.
  * zilog - if non-NULL, the bp is first added with zil_bp_tree_add(); a
  *         non-zero result from that call means the bp is not counted.
  * bp    - block pointer to count.
  * type  - object type index to charge the block to (must be below
  *         ZDB_OT_TOTAL).
  */
 static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	uint64_t refcnt = 0;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
 	/*
 	 * Update four buckets: (level, total), (level, type),
 	 * (all levels, total) and (all levels, type), in that order.
 	 */
 	for (int i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
 		int equal;
 		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
 
 		/*
 		 * The histogram is only big enough to record blocks up to
 		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
 		 * "other", bucket.
 		 */
 		int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
 		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
 		zb->zb_psize_histogram[idx]++;
 
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		/* Count bps whose copies are not all on distinct vdevs. */
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				zb->zb_ditto_samevdev++;
 			break;
 		case 3:
 			/* number of DVA pairs that share a vdev */
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal != 0)
 				zb->zb_ditto_samevdev++;
 			break;
 		}
 
 	}
 
 	/* Embedded bps are only tallied; they are never claimed below. */
 	if (BP_IS_EMBEDDED(bp)) {
 		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
 		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
 		    [BPE_GET_PSIZE(bp)]++;
 		return;
 	}
 
 	/* With -L no claiming is done. */
 	if (dump_opt['L'])
 		return;
 
 	if (BP_GET_DEDUP(bp)) {
 		ddt_t *ddt;
 		ddt_entry_t *dde;
 
 		ddt = ddt_select(zcb->zcb_spa, bp);
 		ddt_enter(ddt);
 		dde = ddt_lookup(ddt, bp, B_FALSE);
 
 		if (dde == NULL) {
 			refcnt = 0;
 		} else {
 			/*
 			 * Drop one reference for this bp; refcnt is what
 			 * remains, and the entry is removed once nothing
 			 * references it any more.
 			 */
 			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
 			ddt_phys_decref(ddp);
 			refcnt = ddp->ddp_refcnt;
 			if (ddt_phys_total_refcnt(dde) == 0)
 				ddt_remove(ddt, dde);
 		}
 		ddt_exit(ddt);
 	}
 
 	/*
 	 * Claim the block; the claim must succeed.  A txg of 0 is passed
 	 * while dedup references remain, spa_first_txg() otherwise.
 	 */
 	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
 /*
  * Completion callback for the reads issued by zdb_blkptr_cb().
  *
  * Frees the read buffer, decrements the pool's in-flight scrub counter and
  * wakes any waiter, and records and reports a read error unless the zio
  * was issued with ZIO_FLAG_SPECULATIVE.
  */
 /* ARGSUSED */
 static void
 zdb_blkptr_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	int ioerr = zio->io_error;
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
 	abd_free(zio->io_abd);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		zcb->zcb_haderrors = 1;
 		/*
 		 * Only tally error numbers that fit the table; anything
 		 * else is still reported below and flagged in zcb_haderrors.
 		 */
 		if (ioerr >= 0 && ioerr < (int)(sizeof (zcb->zcb_errors) /
 		    sizeof (zcb->zcb_errors[0])))
 			zcb->zcb_errors[ioerr]++;
 
 		if (dump_opt['b'] >= 2)
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		else
 			blkbuf[0] = '\0';
 
 		/*
 		 * The level is printed with %lld, so it is passed as a
 		 * signed long long, as zdb_blkptr_cb() does.
 		 */
 		(void) printf("zdb_blkptr_cb: "
 		    "Got error %d reading "
 		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
 		    ioerr,
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)zb->zb_blkid,
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Per-block-pointer traversal callback used by dump_block_stats().
  *
  * Counts the block via zdb_count_block(), optionally issues an
  * asynchronous read of it (when -c is given), and periodically prints a
  * progress line to stderr.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zdb_cb_t *zcb = arg;
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 
 	if (bp == NULL)
 		return (0);
 
 	/* At -bbbbb and above, print every bp with a non-zero birth txg. */
 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
 		    "level %lld offset 0x%llx %s\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)blkid2offset(dnp, bp, zb),
 		    blkbuf);
 	}
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	type = BP_GET_TYPE(bp);
 
 	/* Types with DMU_OT_NEWTYPE set are all charged to "other". */
 	zdb_count_block(zcb, zilog, bp,
 	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
 
 	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
 	/* -c reads metadata only; -cc (or more) reads every block. */
 	if (!BP_IS_EMBEDDED(bp) &&
 	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		size_t size = BP_GET_PSIZE(bp);
 		abd_t *abd = abd_alloc(size, B_FALSE);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			flags |= ZIO_FLAG_SPECULATIVE;
 
 		/*
 		 * Throttle: wait until the number of outstanding reads has
 		 * dropped to max_inflight or below; zdb_blkptr_done()
 		 * decrements the counter and signals the condvar.
 		 */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > max_inflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		/* The buffer is freed by zdb_blkptr_done() on completion. */
 		zio_nowait(zio_read(NULL, spa, bp, abd, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
 	/* only call gethrtime() every 100 blocks */
 	static int iters;
 	if (++iters > 100)
 		iters = 0;
 	else
 		return (0);
 
 	/* Print progress at most once per NANOSEC of gethrtime() time. */
 	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
 		uint64_t now = gethrtime();
 		char buf[10];
 		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
 		/*
 		 * Elapsed time is divided by 1000 twice, so this is bytes
 		 * per 10^6 time units; the "+ 1" terms avoid division by
 		 * zero here and in the estimate below.
 		 */
 		int kb_per_sec =
 		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
 		int sec_remaining =
 		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
 
 		/* make sure nicenum has enough space */
 		CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
 
 		zfs_nicenum(bytes, buf, sizeof (buf));
 		(void) fprintf(stderr,
 		    "\r%5s completed (%4dMB/s) "
 		    "estimated time remaining: %uhr %02umin %02usec        ",
 		    buf, kb_per_sec / 1024,
 		    sec_remaining / 60 / 60,
 		    sec_remaining / 60 % 60,
 		    sec_remaining % 60);
 
 		zcb->zcb_lastprint = now;
 	}
 
 	return (0);
 }
 
 static void
 zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
 /*
  * Metaslab ops with a NULL allocator.  zdb_leak_init() installs it on the
  * normal and log classes so the allocator does not try to use the
  * metaslab trees while zdb has overloaded them.
  */
 static metaslab_ops_t zdb_metaslab_ops = {
 	NULL	/* alloc */
 };
 
 /*
  * Walk the dedup table and account for its entries before the pool
  * traversal starts.
  *
  * Ditto copies are counted as ZDB_OT_DITTO blocks; for every other
  * populated phys entry the space saved by dedup is added to
  * zcb_dedup_asize and zcb_dedup_blocks is bumped.  Unless -L is given,
  * each walked entry is also looked up in the in-core DDT with the "add"
  * flag set.  The walk stops as soon as the DDT_CLASS_UNIQUE class is
  * reached.
  */
 static void
 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	ddt_bookmark_t ddb = { 0 };
 	ddt_entry_t dde;
 	int error;
 
 	for (;;) {
 		blkptr_t blk;
 		int p;
 
 		error = ddt_walk(spa, &ddb, &dde);
 		if (error != 0)
 			break;
 
 		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
 			return;
 
 		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
 
 		for (p = 0; p < DDT_PHYS_TYPES; p++) {
 			ddt_phys_t *ddp = &dde.dde_phys[p];
 
 			/* Unused phys slots have no birth txg. */
 			if (ddp->ddp_phys_birth == 0)
 				continue;
 
 			ddt_bp_create(ddb.ddb_checksum,
 			    &dde.dde_key, ddp, &blk);
 
 			if (p != DDT_PHYS_DITTO) {
 				zcb->zcb_dedup_asize +=
 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
 				zcb->zcb_dedup_blocks++;
 			} else {
 				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
 			}
 		}
 
 		if (dump_opt['L'])
 			continue;
 
 		/* blk holds the last bp built for this entry above. */
 		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
 		ddt_enter(ddt);
 		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
 		ddt_exit(ddt);
 	}
 
 	ASSERT(error == ENOENT);
 }
 
 /*
  * Prepare for leak detection before the pool traversal.
  *
  * Records the pool in zcb and, unless -L is given, replaces the normal
  * and log class ops with zdb_metaslab_ops and reloads every metaslab's
  * space map into ms_tree as SM_ALLOC.  Finally the dedup table is
  * accounted for via zdb_ddt_leak_init() while holding SCL_CONFIG as reader.
  */
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
 
 	if (!dump_opt['L']) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		/*
 		 * We are going to be changing the meaning of the metaslab's
 		 * ms_tree.  Ensure that the allocator doesn't try to
 		 * use the tree.
 		 */
 		spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 		spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
 
 		/* Visit every metaslab of every top-level vdev. */
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 			metaslab_group_t *mg = vd->vdev_mg;
 			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
 				ASSERT3P(msp->ms_group, ==, mg);
 				mutex_enter(&msp->ms_lock);
 				metaslab_unload(msp);
 
 				/*
 				 * For leak detection, we overload the metaslab
 				 * ms_tree to contain allocated segments
 				 * instead of free segments. As a result,
 				 * we can't use the normal metaslab_load/unload
 				 * interfaces.
 				 */
 				if (msp->ms_sm != NULL) {
 					/* "\r" rewrites one progress line */
 					(void) fprintf(stderr,
 					    "\rloading space map for "
 					    "vdev %llu of %llu, "
 					    "metaslab %llu of %llu ...",
 					    (longlong_t)c,
 					    (longlong_t)rvd->vdev_children,
 					    (longlong_t)m,
 					    (longlong_t)vd->vdev_ms_count);
 
 					/*
 					 * We don't want to spend the CPU
 					 * manipulating the size-ordered
 					 * tree, so clear the range_tree
 					 * ops.
 					 */
 					msp->ms_tree->rt_ops = NULL;
 					VERIFY0(space_map_load(msp->ms_sm,
 					    msp->ms_tree, SM_ALLOC));
 
 					/* mark loaded; zdb_leak_fini() clears */
 					if (!msp->ms_loaded) {
 						msp->ms_loaded = B_TRUE;
 					}
 				}
 				mutex_exit(&msp->ms_lock);
 			}
 		}
 		/* End the "\r" progress line. */
 		(void) fprintf(stderr, "\n");
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	zdb_ddt_leak_init(spa, zcb);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Report leaked segments after the traversal.
  *
  * Unless -L is given, every metaslab's ms_tree is vacated with zdb_leak()
  * as the per-segment callback, and the metaslab is marked unloaded again.
  */
 static void
 zdb_leak_fini(spa_t *spa)
 {
 	if (!dump_opt['L']) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		/*
 		 * Unsigned 64-bit indices, matching zdb_leak_init(), so the
 		 * loop bounds are not compared against a signed int.
 		 */
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 			metaslab_group_t *mg = vd->vdev_mg;
 			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
 				ASSERT3P(mg, ==, msp->ms_group);
 				mutex_enter(&msp->ms_lock);
 
 				/*
 				 * The ms_tree has been overloaded to
 				 * contain allocated segments. Now that we
 				 * finished traversing all blocks, any
 				 * block that remains in the ms_tree
 				 * represents an allocated block that we
 				 * did not claim during the traversal.
 				 * Claimed blocks would have been removed
 				 * from the ms_tree.
 				 */
 				range_tree_vacate(msp->ms_tree, zdb_leak, vd);
 
 				if (msp->ms_loaded) {
 					msp->ms_loaded = B_FALSE;
 				}
 
 				mutex_exit(&msp->ms_lock);
 			}
 		}
 	}
 }
 
 /*
  * Block-pointer callback used by dump_block_stats() for the deferred-free
  * and free lists: count each bp as ZDB_OT_DEFERRED and, at -bbbbb and
  * above, print it.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	if (dump_opt['b'] >= 5) {
 		char text[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(text, sizeof (text), bp);
 		(void) printf("[%s] %s\n", "deferred free", text);
 	}
 
 	zdb_count_block((zdb_cb_t *)arg, NULL, bp, ZDB_OT_DEFERRED);
 
 	return (0);
 }
 
 /*
  * Traverse every block in the pool, optionally verifying checksums (-c)
  * and checking for leaks (unless -L), then print block statistics.
  *
  * Returns 2 if the traversed size does not match the allocated size or no
  * block pointers were found, 3 if errors were recorded during traversal,
  * and 0 otherwise.
  */
 static int
 dump_block_stats(spa_t *spa)
 {
 	zdb_cb_t zcb = { 0 };
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
 
 	/* The banner wording depends on which of -c and -L are in effect. */
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * Load all space maps as SM_ALLOC maps, then traverse the pool
 	 * claiming each block we discover.  If the pool is perfectly
 	 * consistent, the space maps will be empty when we're done.
 	 * Anything left over is a leak; any block we can't claim (because
 	 * it's not part of any space map) is a double allocation,
 	 * reference to a freed block, or an unclaimed log block.
 	 */
 	zdb_leak_init(spa, &zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    count_block_cb, &zcb, NULL);
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
 		    count_block_cb, &zcb, NULL);
 	}
 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
 		    &zcb, NULL));
 	}
 
 	/* -cc reads data blocks too, so prefetch them as well. */
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
 	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
 	 * all async I/Os to complete.
 	 */
 	if (dump_opt['c']) {
 		for (int i = 0; i < max_ncpus; i++) {
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 			/* install a fresh root zio in place of the old one */
 			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_GODFATHER);
 		}
 	}
 
 	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
 		for (int e = 0; e < 256; e++) {
 			if (zcb.zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb.zcb_errors[e]);
 			}
 		}
 	}
 
 	/*
 	 * Report any leaked segments.
 	 */
 	zdb_leak_fini(spa);
 
 	/* Grand total over all levels and all object types. */
 	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	/*
 	 * Compare what the metaslab classes report as allocated with what
 	 * the traversal found, less the dedup-referenced size accumulated
 	 * in zcb_dedup_asize.
 	 */
 	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
 	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
 
 	if (total_found == total_alloc) {
 		if (!dump_opt['L'])
 			(void) printf("\n\tNo leaks (block sum matches space"
 			    " maps exactly)\n");
 	} else {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
 		    (u_longlong_t)total_found,
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
 		leaks = B_TRUE;
 	}
 
 	/* Nothing counted: also avoids dividing by zb_count below. */
 	if (tzb->zb_count == 0)
 		return (2);
 
 	(void) printf("\n");
 	(void) printf("\tbp count:      %10llu\n",
 	    (u_longlong_t)tzb->zb_count);
 	(void) printf("\tganged count:  %10llu\n",
 	    (longlong_t)tzb->zb_gangs);
 	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 	(void) printf("\tbp physical:   %10llu      avg:"
 	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
 	(void) printf("\tbp allocated:  %10llu      avg:"
 	    " %6llu     compression: %6.2f\n",
 	    (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
 	(void) printf("\tbp deduped:    %10llu    ref>1:"
 	    " %6llu   deduplication: %6.2f\n",
 	    (u_longlong_t)zcb.zcb_dedup_asize,
 	    (u_longlong_t)zcb.zcb_dedup_blocks,
 	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
 	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	/* Embedded block pointers are reported separately, per type. */
 	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb.zcb_embedded_blocks[i] == 0)
 			continue;
 		(void) printf("\n");
 		(void) printf("\tadditional, non-pointer bps of type %u: "
 		    "%10llu\n",
 		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
 
 		if (dump_opt['b'] >= 3) {
 			(void) printf("\t number of (compressed) bytes:  "
 			    "number of bps\n");
 			dump_histogram(zcb.zcb_embedded_histogram[i],
 			    sizeof (zcb.zcb_embedded_histogram[i]) /
 			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
 		}
 	}
 
 	if (tzb->zb_ditto_samevdev != 0) {
 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
 		    (longlong_t)tzb->zb_ditto_samevdev);
 	}
 
 	/* -bb and above: one table row per object type (and per level). */
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			char csize[32], lsize[32], psize[32], asize[32];
 			char avg[32], gang[32];
 			char *typename;
 
 			/* make sure nicenum has enough space */
 			CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
 			CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
 			CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
 			CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
 			CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
 			CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
 
 			/* Regular DMU types first, then zdb's extended ones. */
 			if (t < DMU_OT_NUMTYPES)
 				typename = dmu_ot[t].ot_name;
 			else
 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			/* Types with nothing allocated get a row of dashes. */
 			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
 				    "\t%5s\t%5s\t%6s\t%s\n",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    typename);
 				continue;
 			}
 
 			/*
 			 * Walk levels from highest to lowest; the final
 			 * iteration (l == -1) selects the ZB_TOTAL row.
 			 */
 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
 				level = (l == -1 ? ZB_TOTAL : l);
 				zb = &zcb.zcb_type[level][t];
 
 				if (zb->zb_asize == 0)
 					continue;
 
 				/* Per-level rows need -bbb or more. */
 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
 					continue;
 
 				/* Skip L0 when it would equal the total. */
 				if (level == 0 && zb->zb_asize ==
 				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
 				zdb_nicenum(zb->zb_count, csize,
 				    sizeof (csize));
 				zdb_nicenum(zb->zb_lsize, lsize,
 				    sizeof (lsize));
 				zdb_nicenum(zb->zb_psize, psize,
 				    sizeof (psize));
 				zdb_nicenum(zb->zb_asize, asize,
 				    sizeof (asize));
 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
 				    sizeof (avg));
 				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
 				    csize, lsize, psize, asize, avg,
 				    (double)zb->zb_lsize / zb->zb_psize,
 				    100.0 * zb->zb_asize / tzb->zb_asize);
 
 				if (level == ZB_TOTAL)
 					(void) printf("%s\n", typename);
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
 
 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
 					(void) printf("\t number of ganged "
 					    "blocks: %s\n", gang);
 				}
 
 				if (dump_opt['b'] >= 4) {
 					(void) printf("psize "
 					    "(in 512-byte sectors): "
 					    "number of blocks\n");
 					dump_histogram(zb->zb_psize_histogram,
 					    PSIZE_HISTO_SIZE, 0);
 				}
 			}
 		}
 	}
 
 	(void) printf("\n");
 
 	/* A size mismatch takes precedence over read errors. */
 	if (leaks)
 		return (2);
 
 	if (zcb.zcb_haderrors)
 		return (3);
 
 	return (0);
 }
 
 /*
  * One node of the AVL tree built by dump_simulated_ddt(): the running
  * totals for all traversed blocks that share a DDT key.
  */
 typedef struct zdb_ddt_entry {
 	ddt_key_t	zdde_key;		/* key from ddt_key_fill() */
 	uint64_t	zdde_ref_blocks;	/* number of blocks seen */
 	uint64_t	zdde_ref_lsize;		/* sum of BP_GET_LSIZE() */
 	uint64_t	zdde_ref_psize;		/* sum of BP_GET_PSIZE() */
 	uint64_t	zdde_ref_dsize;		/* sum of bp_get_dsize_sync() */
 	avl_node_t	zdde_node;		/* AVL linkage */
 } zdb_ddt_entry_t;
 
 /*
  * Traversal callback for dump_simulated_ddt(): fold one block pointer into
  * the AVL tree (passed as arg) of would-be dedup entries.
  *
  * Only level-0, non-metadata, checksummed, non-embedded, non-hole blocks
  * are recorded.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	avl_tree_t *tree = arg;
 	zdb_ddt_entry_t probe;
 	zdb_ddt_entry_t *node;
 	avl_index_t where;
 
 	if (bp == NULL)
 		return (0);
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/* With -SS, report progress each time an objset root is reached. */
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(tree));
 	}
 
 	/* Holes were already rejected above. */
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF)
 		return (0);
 	if (BP_GET_LEVEL(bp) > 0)
 		return (0);
 	if (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	ddt_key_fill(&probe.zdde_key, bp);
 
 	node = avl_find(tree, &probe, &where);
 	if (node == NULL) {
 		/* First block with this key: start a zeroed entry. */
 		node = umem_zalloc(sizeof (*node), UMEM_NOFAIL);
 		node->zdde_key = probe.zdde_key;
 		avl_insert(tree, node, where);
 	}
 
 	node->zdde_ref_blocks += 1;
 	node->zdde_ref_lsize += BP_GET_LSIZE(bp);
 	node->zdde_ref_psize += BP_GET_PSIZE(bp);
 	node->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
 
 	return (0);
 }
 
 /*
  * Build and print a simulated dedup-table histogram for the pool.
  *
  * The pool is traversed with zdb_ddt_add_cb() collecting per-key totals in
  * an AVL tree; each entry is then converted to a ddt_stat_t and added to
  * the histogram bucket selected by highbit64() of its reference count.
  */
 static void
 dump_simulated_ddt(spa_t *spa)
 {
 	avl_tree_t tree;
 	void *cookie = NULL;
 	zdb_ddt_entry_t *ent;
 	ddt_histogram_t histo = { 0 };
 	ddt_stat_t total = { 0 };
 
 	avl_create(&tree, ddt_entry_compare,
 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 	    zdb_ddt_add_cb, &tree);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Drain the tree, freeing each entry once it has been tallied. */
 	for (ent = avl_destroy_nodes(&tree, &cookie); ent != NULL;
 	    ent = avl_destroy_nodes(&tree, &cookie)) {
 		ddt_stat_t dds;
 		uint64_t refcnt = ent->zdde_ref_blocks;
 
 		ASSERT(refcnt != 0);
 
 		/* Referenced totals are copied as accumulated... */
 		dds.dds_ref_blocks = ent->zdde_ref_blocks;
 		dds.dds_ref_lsize = ent->zdde_ref_lsize;
 		dds.dds_ref_psize = ent->zdde_ref_psize;
 		dds.dds_ref_dsize = ent->zdde_ref_dsize;
 
 		/* ...and the per-copy figures are the totals over refcnt. */
 		dds.dds_blocks = ent->zdde_ref_blocks / refcnt;
 		dds.dds_lsize = ent->zdde_ref_lsize / refcnt;
 		dds.dds_psize = ent->zdde_ref_psize / refcnt;
 		dds.dds_dsize = ent->zdde_ref_dsize / refcnt;
 
 		ddt_stat_add(&histo.ddh_stat[highbit64(refcnt) - 1],
 		    &dds, 0);
 
 		umem_free(ent, sizeof (*ent));
 	}
 
 	avl_destroy(&tree);
 
 	ddt_histogram_stat(&total, &histo);
 
 	(void) printf("Simulated DDT histogram:\n");
 	zpool_dump_ddt(&total, &histo);
 	dump_dedup_ratio(&total);
 }
 
 /*
  * Top-level pool dump: run each section selected by the command-line
  * options in dump_opt[], in a fixed order.
  *
  * -S is exclusive: only the simulated DDT is printed.  If any verification
  * step yields a non-zero status, the debug buffer is dumped and the
  * process exits with that status.
  */
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
 	if (dump_opt['S']) {
 		dump_simulated_ddt(spa);
 		return;
 	}
 
 	if (!dump_opt['e'] && dump_opt['C'] > 1) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
 	}
 
 	if (dump_opt['C'])
 		dump_config(spa);
 
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
 		dump_metaslab_groups(spa);
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_full_bpobj(&spa->spa_deferred_bpobj,
 			    "Deferred frees", 0);
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_full_bpobj(
 				    &spa->spa_dsl_pool->dp_free_bpobj,
 				    "Pool snapshot frees", 0);
 			}
 
 			if (spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY)) {
 				dump_bptree(spa->spa_meta_objset,
 				    spa->spa_dsl_pool->dp_bptree_obj,
 				    "Pool dataset frees");
 			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 		/* Dump every dataset; this also fills dataset_feature_count. */
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
 		/*
 		 * Cross-check the per-dataset feature counts gathered above
 		 * against each feature's stored refcount.
 		 */
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 			uint64_t refcount;
 
 			if (!(spa_feature_table[f].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET)) {
 				ASSERT0(dataset_feature_count[f]);
 				continue;
 			}
 			(void) feature_get_refcount(spa,
 			    &spa_feature_table[f], &refcount);
 			if (dataset_feature_count[f] != refcount) {
 				(void) printf("%s feature refcount mismatch: "
 				    "%lld datasets != %lld refcount\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)dataset_feature_count[f],
 				    (longlong_t)refcount);
 				rc = 2;
 			} else {
 				(void) printf("Verified %s feature refcount "
 				    "of %llu is correct\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)refcount);
 			}
 		}
 	}
 	/* Later checks run only while no earlier one has failed. */
 	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)
 		rc = verify_spacemap_refcounts(spa);
 
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
 	if (rc != 0) {
 		dump_debug_buffer();
 		exit(rc);
 	}
 }
 
 /*
  * Bit flags selecting how a block is processed and displayed by the
  * zdb_dump_*() / zdb_print_blkptr() helpers; ZDB_FLAG_BSWAP requests
  * byte-swapping of 64-bit words before output.
  */
 #define	ZDB_FLAG_CHECKSUM	0x0001
 #define	ZDB_FLAG_DECOMPRESS	0x0002
 #define	ZDB_FLAG_BSWAP		0x0004
 #define	ZDB_FLAG_GBH		0x0008
 #define	ZDB_FLAG_INDIRECT	0x0010
 #define	ZDB_FLAG_PHYS		0x0020
 #define	ZDB_FLAG_RAW		0x0040
 #define	ZDB_FLAG_PRINT_BLKPTR	0x0080
 
 /*
  * NOTE(review): presumably maps a flag character to its ZDB_FLAG_* bit;
  * it is populated and read outside this chunk -- confirm.
  */
 int flagbits[256];
 
 /*
  * Print one block pointer on its own line.  When ZDB_FLAG_BSWAP is set the
  * block pointer is byte-swapped in place first.
  */
 static void
 zdb_print_blkptr(blkptr_t *bp, int flags)
 {
 	char text[BP_SPRINTF_LEN];
 	int swap = (flags & ZDB_FLAG_BSWAP) != 0;
 
 	if (swap)
 		byteswap_uint64_array((void *)bp, sizeof (*bp));
 
 	snprintf_blkptr(text, sizeof (text), bp);
 	(void) printf("%s\n", text);
 }
 
 /*
  * Print an array of nbps block pointers, one per line, in order.
  */
 static void
 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 {
 	int remaining = nbps;
 	blkptr_t *cur = bp;
 
 	while (remaining-- > 0)
 		zdb_print_blkptr(cur++, flags);
 }
 
 static void
 zdb_dump_gbh(void *buf, int flags)
 {
 	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
 }
 
 /*
  * Write a buffer to file descriptor 1 unformatted, byte-swapping it in
  * place first when ZDB_FLAG_BSWAP is set.  The result of write() is
  * deliberately ignored.
  */
 static void
 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
 	int swap = (flags & ZDB_FLAG_BSWAP) != 0;
 
 	if (swap)
 		byteswap_uint64_array(buf, size);
 
 	(void) write(1, buf, size);
 }
 
 /*
  * Print a buffer as a hex dump: two 64-bit words per line, followed by the
  * same 16 bytes as characters ('.' for non-printable bytes).
  *
  * label - heading printed above the dump.
  * buf   - data to dump.
  * size  - length of buf in bytes; size / 8 words are printed, two at a time.
  * flags - if ZDB_FLAG_BSWAP is set, each word is byte-swapped for display
  *         and the column header is reversed accordingly.
  */
 static void
 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
 {
 	uint64_t *d = (uint64_t *)buf;
 	int nwords = size / sizeof (uint64_t);
 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
 	int i, j;
 	const char *hdr;
 	const unsigned char *c;
 
 
 	if (do_bswap)
 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
 	else
 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
 
 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
 
 	for (i = 0; i < nwords; i += 2) {
 		(void) printf("%06llx:  %016llx  %016llx  ",
 		    (u_longlong_t)(i * sizeof (uint64_t)),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
 
 		/*
 		 * The bytes are examined as unsigned char: passing a
 		 * negative char value to isprint() is undefined, and raw
 		 * block data routinely contains bytes >= 0x80.
 		 */
 		c = (const unsigned char *)&d[i];
 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
 		(void) printf("\n");
 	}
 }
 
 /*
  * Find a vdev below "vdev" from a textual specifier.
  *
  * There are two acceptable formats:
  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
  *	child[.child]*    - For example: 0.1.1
  *
  * The second form can be used to specify arbitrary vdevs anywhere
  * in the hierarchy.  For example, in a pool with a mirror of
  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
  *
  * A leaf name matches a child whose vdev_path equals it, whose last path
  * component equals it, or whose last path component ends in "s0" and
  * matches it once that suffix is dropped.  Children without a vdev_path
  * are searched recursively.
  *
  * Returns the matching vdev, or NULL if vdev is NULL or nothing matches.
  */
 static vdev_t *
 zdb_vdev_lookup(vdev_t *vdev, char *path)
 {
 	char *s, *p, *q;
 	unsigned long idx;
 	size_t len;
 	int i;
 
 	if (vdev == NULL)
 		return (NULL);
 
 	/* First, assume the x.x.x.x format */
 	idx = strtoul(path, &s, 10);
 	if (s == path || (s && *s != '.' && *s != '\0'))
 		goto name;
 	/*
 	 * Compare the full-width value: narrowing it to int first could
 	 * wrap an out-of-range index onto a valid child.
 	 */
 	if (idx >= vdev->vdev_children)
 		return (NULL);
 
 	vdev = vdev->vdev_child[idx];
 	if (*s == '\0')
 		return (vdev);
 	return (zdb_vdev_lookup(vdev, s+1));
 
 name:
 	for (i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *vc = vdev->vdev_child[i];
 
 		if (vc->vdev_path == NULL) {
 			/* Interior vdev: look for the name underneath it. */
 			vc = zdb_vdev_lookup(vc, path);
 			if (vc == NULL)
 				continue;
 			else
 				return (vc);
 		}
 
 		if (strcmp(vc->vdev_path, path) == 0)
 			return (vc);
 
 		/* p is the last component of the child's path. */
 		p = strrchr(vc->vdev_path, '/');
 		p = p ? p + 1 : vc->vdev_path;
 		if (strcmp(p, path) == 0)
 			return (vc);
 
 		/*
 		 * Accept the name without a trailing "s0".  The suffix test
 		 * needs at least two characters of path, and a non-empty
 		 * stem (q > p); with an empty stem strncmp() would compare
 		 * zero bytes and match every name.
 		 */
 		len = strlen(vc->vdev_path);
 		if (len < 2)
 			continue;
 		q = &vc->vdev_path[len - 2];
 		if (q > p && strcmp(q, "s0") == 0 &&
 		    strncmp(p, path, q - p) == 0)
 			return (vc);
 	}
 
 	return (NULL);
 }
 
 /*
  * Adapter that gives random_get_pseudo_bytes() the three-argument
  * (buf, len, private) callback shape passed to abd_iterate_func() in
  * zdb_read_block(); the third argument is ignored.
  */
 /* ARGSUSED */
 static int
 random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
 {
 	int rc = random_get_pseudo_bytes(buf, len);
 
 	return (rc);
 }
 
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
  *
  *	pool:vdev_specifier:offset:size[:flags]
  *
  *	pool           - The name of the pool you wish to read from
  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
  *	offset         - offset, in hex, in bytes
  *	size           - Amount of data to read, in hex, in bytes
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block
  *		*c: Calculate and display checksums
  *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
  *		 g: Display data as a gang block header
  *		 i: Display as an indirect block
  *		 p: Do I/O to physical offset
  *		 r: Dump raw data to stdout
  *
  *              * = not yet implemented
  *
  * "thing" as parsed here starts at the vdev specifier; the pool is the
  * already-open spa.  size must be a non-zero multiple of the sector size
  * no larger than SPA_MAXBLOCKSIZE, and offset must be sector-aligned.
  * Problems are reported on stdout and the function simply returns.
  */
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
 	blkptr_t blk, *bp = &blk;
 	dva_t *dva = bp->blk_dva;
 	int flags = 0;
 	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
 	abd_t *pabd;
 	void *lbuf, *buf;
 	char *s, *p, *dup, *vdev, *flagstr;
 	int i, error;
 
 	/* strtok() writes into its argument, so tokenize a private copy. */
 	dup = strdup(thing);
 	if (dup == NULL)
 		fatal("internal error: %s", strerror(ENOMEM));
 	s = strtok(dup, ":");
 	vdev = s ? s : "";
 	s = strtok(NULL, ":");
 	offset = strtoull(s ? s : "", NULL, 16);
 	s = strtok(NULL, ":");
 	size = strtoull(s ? s : "", NULL, 16);
 	s = strtok(NULL, ":");
 	flagstr = s ? s : "";
 
 	s = NULL;
 	if (size == 0)
 		s = "size must not be zero";
 	/*
 	 * Both I/O buffers below are SPA_MAXBLOCKSIZE bytes, and the
 	 * decompress path computes SPA_MAXBLOCKSIZE - psize.
 	 */
 	if (size > SPA_MAXBLOCKSIZE)
 		s = "size must not exceed the maximum block size";
 	if (!IS_P2ALIGNED(size, DEV_BSIZE))
 		s = "size must be a multiple of sector size";
 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
 		s = "offset must be a multiple of sector size";
 	if (s) {
 		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
 		free(dup);
 		return;
 	}
 
 	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
 		for (i = 0; flagstr[i]; i++) {
 			int bit = flagbits[(uchar_t)flagstr[i]];
 
 			if (bit == 0) {
 				(void) printf("***Invalid flag: %c\n",
 				    flagstr[i]);
 				continue;
 			}
 			flags |= bit;
 
 			/* If it's not something with an argument, keep going */
 			if ((bit & (ZDB_FLAG_CHECKSUM |
 			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
 				continue;
 
 			p = &flagstr[i + 1];
 			if (bit == ZDB_FLAG_PRINT_BLKPTR)
 				blkptr_offset = strtoull(p, &p, 16);
 			if (*p != ':' && *p != '\0') {
 				(void) printf("***Invalid flag arg: '%s'\n", s);
 				free(dup);
 				return;
 			}
 			i += p - &flagstr[i + 1]; /* skip over the number */
 		}
 	}
 
 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
 	if (vd == NULL) {
 		(void) printf("***Invalid vdev: %s\n", vdev);
 		free(dup);
 		return;
 	} else {
 		if (vd->vdev_path)
 			(void) fprintf(stderr, "Found vdev: %s\n",
 			    vd->vdev_path);
 		else
 			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
 	psize = size;
 	lsize = size;
 
 	pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	/*
 	 * Build a block pointer describing the requested extent: one DVA,
 	 * no compression, no checksum.
 	 */
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
 	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 
 	BP_SET_LSIZE(bp, lsize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	if (vd == vd->vdev_top) {
 		/*
 		 * Treat this as a normal block read.
 		 */
 		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
 		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
 	}
 
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (error) {
 		(void) printf("Read of %s failed, error: %d\n", thing, error);
 		goto out;
 	}
 
 	if (flags & ZDB_FLAG_DECOMPRESS) {
 		/*
 		 * We don't know how the data was compressed, so just try
 		 * every decompress function at every inflated blocksize.
 		 */
 		enum zio_compress c;
 		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 		abd_copy_to_buf(pbuf2, pabd, psize);
 
 		/*
 		 * Pad the two copies of the input with different random
 		 * bytes past psize.  A decompression is only accepted when
 		 * both copies inflate to the same output.
 		 */
 		VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
 		    random_get_pseudo_bytes_cb, NULL));
 
 		VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
 		    SPA_MAXBLOCKSIZE - psize));
 
 		/*
 		 * The loop header is the only place lsize is stepped, so
 		 * every multiple of SPA_MINBLOCKSIZE above psize is tried.
 		 */
 		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
 		    lsize -= SPA_MINBLOCKSIZE) {
 			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
 				if (zio_decompress_data(c, pabd,
 				    lbuf, psize, lsize) == 0 &&
 				    zio_decompress_data_buf(c, pbuf2,
 				    lbuf2, psize, lsize) == 0 &&
 				    bcmp(lbuf, lbuf2, lsize) == 0)
 					break;
 			}
 			if (c != ZIO_COMPRESS_FUNCTIONS)
 				break;
 		}
 
 		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
 		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
 
 		if (lsize <= psize) {
 			(void) printf("Decompress of %s failed\n", thing);
 			goto out;
 		}
 		buf = lbuf;
 		size = lsize;
 	} else {
 		buf = abd_to_buf(pabd);
 		size = psize;
 	}
 
 	if (flags & ZDB_FLAG_PRINT_BLKPTR) {
 		/* The blkptr must lie entirely inside the data we hold. */
 		if (blkptr_offset > size ||
 		    size - blkptr_offset < sizeof (blkptr_t)) {
 			(void) printf("***Invalid blkptr offset: %llx\n",
 			    (u_longlong_t)blkptr_offset);
 			goto out;
 		}
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
 	} else if (flags & ZDB_FLAG_RAW)
 		zdb_dump_block_raw(buf, size, flags);
 	else if (flags & ZDB_FLAG_INDIRECT)
 		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
 		    flags);
 	else if (flags & ZDB_FLAG_GBH)
 		zdb_dump_gbh(buf, flags);
 	else
 		zdb_dump_block(thing, buf, size, flags);
 
 out:
 	abd_free(pabd);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 	free(dup);
 }
 
 /*
  * Decode an embedded block pointer given on the command line and write
  * its payload to stdout as raw bytes.
  *
  * "thing" must consist of 16 colon-separated hexadecimal words, which are
  * stored in order as the 16 64-bit words of a blkptr_t.  Malformed input
  * or a decode failure prints a message and exits with status 1.
  */
 static void
 zdb_embedded_block(char *thing)
 {
 	blkptr_t bp = { 0 };
 	unsigned long long *words = (void *)&bp;
-	char buf[SPA_MAXBLOCKSIZE];
+	char *buf;
 	int err;
 
 	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
 	    words + 0, words + 1, words + 2, words + 3,
 	    words + 4, words + 5, words + 6, words + 7,
 	    words + 8, words + 9, words + 10, words + 11,
 	    words + 12, words + 13, words + 14, words + 15);
 	if (err != 16) {
 		(void) printf("invalid input format\n");
 		exit(1);
 	}
 	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
 	/* NOTE(review): the malloc() result is used without a NULL check. */
+	buf = malloc(SPA_MAXBLOCKSIZE);
 	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
 	if (err != 0) {
 		(void) printf("decode failed: %u\n", err);
+		free(buf);
 		exit(1);
 	}
 	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+	free(buf);
 }
 
 /*
  * Decide whether the pool configuration cfg is the pool named by tgt.
  *
  * If tgt parses (strtoull(), base auto-detected) to a non-zero number it
  * is taken as a pool GUID and compared with the config's pool GUID;
  * otherwise it is compared as a string with the config's pool name.
  * A config lacking the relevant entry never matches.
  */
 static boolean_t
 pool_match(nvlist_t *cfg, char *tgt)
 {
 	uint64_t want_guid = strtoull(tgt, NULL, 0);
 	uint64_t have_guid;
 	char *have_name;
 
 	if (want_guid == 0) {
 		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME,
 		    &have_name) != 0)
 			return (B_FALSE);
 		return (strcmp(have_name, tgt) == 0);
 	}
 
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID,
 	    &have_guid) != 0)
 		return (B_FALSE);
 	return (have_guid == want_guid);
 }
 
 /*
  * Search the importable pools for the one named by *target.
  *
  * *target may be a pool name or GUID, optionally followed by a "/..." or
  * "@..." suffix; only the part before the first '/' or '@' is used for
  * matching (see pool_match()).  dirc/dirv are handed to
  * zpool_search_import() as the search paths.
  *
  * On a unique match the pool's name is returned and *configp is set to
  * its configuration.  If the target does not begin with that name (for
  * example because a GUID was given), *target is replaced by a newly
  * allocated string holding the name followed by the original suffix.
  * With no match, NULL is returned and *configp is set to NULL.  If more
  * than one pool matches, the matching configs are printed and fatal()
  * is called.
  */
 static char *
 find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
 {
 	nvlist_t *pools;
 	nvlist_t *match = NULL;
 	char *name = NULL;
 	char *sepp = NULL;
 	char sep = '\0';
 	int count = 0;
 	importargs_t args = { 0 };
 
 	args.paths = dirc;
 	args.path = dirv;
 	args.can_be_active = B_TRUE;
 
 	/* Temporarily cut the target at the dataset/snapshot separator. */
 	if ((sepp = strpbrk(*target, "/@")) != NULL) {
 		sep = *sepp;
 		*sepp = '\0';
 	}
 
 	pools = zpool_search_import(g_zfs, &args);
 
 	if (pools != NULL) {
 		nvpair_t *elem = NULL;
 		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
 			verify(nvpair_value_nvlist(elem, configp) == 0);
 			if (pool_match(*configp, *target)) {
 				count++;
 				if (match != NULL) {
 					/* print previously found config */
 					if (name != NULL) {
 						(void) printf("%s\n", name);
 						dump_nvlist(match, 8);
 						name = NULL;
 					}
 					(void) printf("%s\n",
 					    nvpair_name(elem));
 					dump_nvlist(*configp, 8);
 				} else {
 					/* First hit: remember it. */
 					match = *configp;
 					name = nvpair_name(elem);
 				}
 			}
 		}
 	}
 	if (count > 1)
 		(void) fatal("\tMatched %d pools - use pool GUID "
 		    "instead of pool name or \n"
 		    "\tpool name part of a dataset name to select pool", count);
 
 	/* Put back the separator removed above. */
 	if (sepp)
 		*sepp = sep;
 	/*
 	 * If pool GUID was specified for pool id, replace it with pool name
 	 */
 	if (name && (strstr(*target, name) != *target)) {
 		int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
 
 		*target = umem_alloc(sz, UMEM_NOFAIL);
 		(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
 	}
 
 	*configp = name ? match : NULL;
 
 	return (name);
 }
 
 /*
  * zdb entry point: parse the options into dump_opt[], handle the modes
  * that need no open pool (-E, -C without a target, -l, -O), then open the
  * target pool or dataset and either dump it or, with -R, read the blocks
  * named by the remaining arguments.
  */
 int
 main(int argc, char **argv)
 {
 	int i, c;
 	struct rlimit rl = { 1024, 1024 };
 	spa_t *spa = NULL;
 	objset_t *os = NULL;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target;
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int flags = ZFS_IMPORT_MISSING_LOG;
 	int rewind = ZPOOL_NEVER_REWIND;
 	char *spa_config_path_env;
 	boolean_t target_is_spa = B_TRUE;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
 
 	dprintf_setup(&argc, argv);
 
 	/*
 	 * If there is an environment variable SPA_CONFIG_PATH it overrides
 	 * default spa_config_path setting. If -U flag is specified it will
 	 * override this environment variable settings once again.
 	 */
 	spa_config_path_env = getenv("SPA_CONFIG_PATH");
 	if (spa_config_path_env != NULL)
 		spa_config_path = spa_config_path_env;
 
 	while ((c = getopt(argc, argv,
 	    "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
 		switch (c) {
 		/* Options that select what to dump; any of them turns off dump_all. */
 		case 'b':
 		case 'c':
 		case 'C':
 		case 'd':
 		case 'D':
 		case 'E':
 		case 'G':
 		case 'h':
 		case 'i':
 		case 'l':
 		case 'm':
 		case 'M':
 		case 'O':
 		case 'R':
 		case 's':
 		case 'S':
 		case 'u':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
 		/* Options that are counted but leave dump_all alone. */
 		case 'A':
 		case 'e':
 		case 'F':
 		case 'L':
 		case 'P':
 		case 'q':
 		case 'X':
 			dump_opt[c]++;
 			break;
 		/* NB: Sort single match options below. */
 		case 'I':
 			max_inflight = strtoull(optarg, NULL, 0);
 			if (max_inflight == 0) {
 				(void) fprintf(stderr, "maximum number "
 				    "of inflight I/Os must be greater "
 				    "than 0\n");
 				usage();
 			}
 			break;
 		case 'o':
 			error = set_global_var(optarg);
 			if (error != 0)
 				usage();
 			break;
 		case 'p':
 			/* Grow the search directory array by one entry. */
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
 				    UMEM_NOFAIL);
 			} else {
 				char **tmp = umem_alloc((nsearch + 1) *
 				    sizeof (char *), UMEM_NOFAIL);
 				bcopy(searchdirs, tmp, nsearch *
 				    sizeof (char *));
 				umem_free(searchdirs,
 				    nsearch * sizeof (char *));
 				searchdirs = tmp;
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
 			}
 			break;
 		case 'U':
 			spa_config_path = optarg;
 			if (spa_config_path[0] != '/') {
 				(void) fprintf(stderr,
 				    "cachefile must be an absolute path "
 				    "(i.e. start with a slash)\n");
 				usage();
 			}
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'V':
 			flags = ZFS_IMPORT_VERBATIM;
 			break;
 		case 'x':
 			vn_dumpdir = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (!dump_opt['e'] && searchdirs != NULL) {
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
 
 	/*
 	 * ZDB does not typically re-read blocks; therefore limit the ARC
 	 * to 256 MB, which can be used entirely for metadata.
 	 */
 	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
 
 	/*
 	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
 	 * "zdb -b" uses traversal prefetch which uses async reads.
 	 * For good performance, let several of them be active at once.
 	 */
 	zfs_vdev_async_read_max_active = 10;
 
 	/*
 	 * Disable reference tracking for better performance.
 	 */
 	reference_tracking_enable = B_FALSE;
 
 	kernel_init(FREAD);
 	g_zfs = libzfs_init();
 	if (g_zfs == NULL)
 		fatal("Fail to initialize zfs");
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	/*
 	 * With no selecting option, enable every dump option except the
 	 * ones listed in the string below; every enabled option is then
 	 * raised by the verbosity level.
 	 */
 	for (c = 0; c < 256; c++) {
 		if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
 	}
 
 	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
 	zfs_recover = (dump_opt['A'] > 1);
 
 	argc -= optind;
 	argv += optind;
 
 	/* -R needs a pool plus at least one block descriptor. */
 	if (argc < 2 && dump_opt['R'])
 		usage();
 
 	if (dump_opt['E']) {
 		if (argc != 1)
 			usage();
 		zdb_embedded_block(argv[0]);
 		return (0);
 	}
 
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
 			return (0);
 		}
 		usage();
 	}
 
 	if (dump_opt['l'])
 		return (dump_label(argv[0]));
 
 	if (dump_opt['O']) {
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
 		return (dump_path(argv[0], argv[1]));
 	}
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
 	/* Rewind policy (-t, -F, -X) handed to the import and open calls. */
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
 	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
 		fatal("internal error: %s", strerror(ENOMEM));
 
 	error = 0;
 	target = argv[0];
 
 	/* -e: locate the pool with find_zpool() and import it first. */
 	if (dump_opt['e']) {
 		nvlist_t *cfg = NULL;
 		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
 
 		error = ENOENT;
 		if (name) {
 			if (dump_opt['C'] > 1) {
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
 			if (nvlist_add_nvlist(cfg,
 			    ZPOOL_REWIND_POLICY, policy) != 0) {
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
 			error = spa_import(name, cfg, NULL, flags);
 		}
 	}
 
 	if (strpbrk(target, "/@") != NULL) {
 		size_t targetlen;
 
 		target_is_spa = B_FALSE;
 		/*
 		 * Remove any trailing slash.  Later code would get confused
 		 * by it, but we want to allow it so that "pool/" can
 		 * indicate that we want to dump the topmost filesystem,
 		 * rather than the whole pool.
 		 */
 		targetlen = strlen(target);
 		if (targetlen != 0 && target[targetlen - 1] == '/')
 			target[targetlen - 1] = '\0';
 	}
 
 	if (error == 0) {
 		if (target_is_spa || dump_opt['R']) {
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
 				/*
 				 * If we're missing the log device then
 				 * try opening the pool after clearing the
 				 * log state.
 				 */
 				mutex_enter(&spa_namespace_lock);
 				if ((spa = spa_lookup(target)) != NULL &&
 				    spa->spa_log_state == SPA_LOG_MISSING) {
 					spa->spa_log_state = SPA_LOG_CLEAR;
 					error = 0;
 				}
 				mutex_exit(&spa_namespace_lock);
 
 				if (!error) {
 					error = spa_open_rewind(target, &spa,
 					    FTAG, policy, NULL);
 				}
 			}
 		} else {
 			error = open_objset(target, DMU_OST_ANY, FTAG, &os);
 		}
 	}
 	nvlist_free(policy);
 
 	if (error)
 		fatal("can't open '%s': %s", target, strerror(error));
 
 	/* Step past the target; what remains is object numbers or blocks. */
 	argv++;
 	argc--;
 	if (!dump_opt['R']) {
 		if (argc > 0) {
 			zopt_objects = argc;
 			/* NOTE(review): the calloc() result is not checked. */
 			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
 			for (i = 0; i < zopt_objects; i++) {
 				errno = 0;
 				zopt_object[i] = strtoull(argv[i], NULL, 0);
 				if (zopt_object[i] == 0 && errno != 0)
 					fatal("bad number %s: %s",
 					    argv[i], strerror(errno));
 			}
 		}
 		if (os != NULL) {
 			dump_dir(os);
 		} else if (zopt_objects > 0 && !dump_opt['m']) {
 			dump_dir(spa->spa_meta_objset);
 		} else {
 			dump_zpool(spa);
 		}
 	} else {
 		/* Bind the -R flag characters to their ZDB_FLAG_* bits. */
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
 		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
 		flagbits['e'] = ZDB_FLAG_BSWAP;
 		flagbits['g'] = ZDB_FLAG_GBH;
 		flagbits['i'] = ZDB_FLAG_INDIRECT;
 		flagbits['p'] = ZDB_FLAG_PHYS;
 		flagbits['r'] = ZDB_FLAG_RAW;
 
 		for (i = 0; i < argc; i++)
 			zdb_read_block(argv[i], spa);
 	}
 
 	if (os != NULL)
 		close_objset(os, FTAG);
 	else
 		spa_close(spa, FTAG);
 
 	fuid_table_destroy();
 
 	dump_debug_buffer();
 
 	libzfs_fini(g_zfs);
 	kernel_fini();
 
 	return (0);
 }
Index: projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb
===================================================================
--- projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb	(revision 326161)
+++ projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb	(revision 326162)

Property changes on: projects/bsd_rdma_4_9/cddl/contrib/opensolaris/cmd/zdb
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/cddl/contrib/opensolaris/cmd/zdb:r325505-326161
Index: projects/bsd_rdma_4_9/cddl/contrib/opensolaris
===================================================================
--- projects/bsd_rdma_4_9/cddl/contrib/opensolaris	(revision 326161)
+++ projects/bsd_rdma_4_9/cddl/contrib/opensolaris	(revision 326162)

Property changes on: projects/bsd_rdma_4_9/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/cddl/contrib/opensolaris:r326132-326161
Index: projects/bsd_rdma_4_9/cddl
===================================================================
--- projects/bsd_rdma_4_9/cddl	(revision 326161)
+++ projects/bsd_rdma_4_9/cddl	(revision 326162)

Property changes on: projects/bsd_rdma_4_9/cddl
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/cddl:r326132-326161
Index: projects/bsd_rdma_4_9/contrib/binutils/bfd/ihex.c
===================================================================
--- projects/bsd_rdma_4_9/contrib/binutils/bfd/ihex.c	(revision 326161)
+++ projects/bsd_rdma_4_9/contrib/binutils/bfd/ihex.c	(revision 326162)
@@ -1,993 +1,993 @@
 /* BFD back-end for Intel Hex objects.
    Copyright 1995, 1996, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    2006, 2007 Free Software Foundation, Inc.
    Written by Ian Lance Taylor of Cygnus Support <ian@cygnus.com>.
 
    This file is part of BFD, the Binary File Descriptor library.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
 
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
 
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
 
 /* This is what Intel Hex files look like:
 
 1. INTEL FORMATS
 
 A. Intel 1
 
    16-bit address-field format, for files 64k bytes in length or less.
 
    DATA RECORD
    Byte 1	Header = colon(:)
    2..3		The number of data bytes in hex notation
    4..5		High byte of the record load address
    6..7		Low byte of the record load address
    8..9		Record type, must be "00"
    10..x	Data bytes in hex notation:
 	x = (number of bytes - 1) * 2 + 11
    x+1..x+2	Checksum in hex notation
    x+3..x+4	Carriage return, line feed
 
    END RECORD
    Byte 1	Header = colon (:)
    2..3		The byte count, must be "00"
    4..7		Transfer-address (usually "0000")
 		the jump-to address, execution start address
    8..9		Record type, must be "01"
    10..11	Checksum, in hex notation
    12..13	Carriage return, line feed
 
 B. INTEL 2
 
    MCS-86 format, using a 20-bit address for files larger than 64K bytes.
 
    DATA RECORD
    Byte 1	Header = colon (:)
    2..3		The byte count of this record, hex notation
    4..5		High byte of the record load address
    6..7		Low byte of the record load address
    8..9		Record type, must be "00"
    10..x	The data bytes in hex notation:
 	x = (number of data bytes - 1) * 2 + 11
    x+1..x+2	Checksum in hex notation
    x+3..x+4	Carriage return, line feed
 
    EXTENDED ADDRESS RECORD
    Byte 1	Header = colon(:)
    2..3		The byte count, must be "02"
    4..7		Load address, must be "0000"
    8..9		Record type, must be "02"
    10..11	High byte of the offset address
    12..13	Low byte of the offset address
    14..15	Checksum in hex notation
    16..17	Carriage return, line feed
 
    The checksums are the two's complement of the 8-bit sum
    without carry of the byte count, offset address, and the
    record type.
 
    START ADDRESS RECORD
    Byte 1	Header = colon (:)
    2..3		The byte count, must be "04"
    4..7		Load address, must be "0000"
    8..9		Record type, must be "03"
    10..13	8086 CS value
    14..17	8086 IP value
    18..19	Checksum in hex notation
    20..21	Carriage return, line feed
 
 Another document reports these additional types:
 
    EXTENDED LINEAR ADDRESS RECORD
    Byte 1	Header = colon (:)
    2..3		The byte count, must be "02"
    4..7		Load address, must be "0000"
    8..9		Record type, must be "04"
    10..13	Upper 16 bits of address of subsequent records
    14..15	Checksum in hex notation
    16..17	Carriage return, line feed
 
    START LINEAR ADDRESS RECORD
    Byte 1	Header = colon (:)
    2..3		The byte count, must be "02"
    4..7		Load address, must be "0000"
    8..9		Record type, must be "05"
    10..13	Upper 16 bits of start address
    14..15	Checksum in hex notation
    16..17	Carriage return, line feed
 
 The MRI compiler uses this, which is a repeat of type 5:
 
   EXTENDED START RECORD
    Byte 1	Header = colon (:)
    2..3		The byte count, must be "04"
    4..7		Load address, must be "0000"
    8..9		Record type, must be "05"
    10..13	Upper 16 bits of start address
    14..17	Lower 16 bits of start address
    18..19	Checksum in hex notation
    20..21	Carriage return, line feed.  */
 
 #include "sysdep.h"
 #include "bfd.h"
 #include "libbfd.h"
 #include "libiberty.h"
 #include "safe-ctype.h"
 
 /* The number of bytes we put on one line during output.  */
 
 #define CHUNK 16
 
 /* Macros for converting between hex and binary.  NIBBLE and ISHEX wrap
    libiberty's hex_value and hex_p.  HEX2 combines the two hex digits at
    BUFFER[0] and BUFFER[1] into one value, and HEX4 combines four
    consecutive digits, most significant digit first in both cases.  The
    arguments are evaluated more than once.  */
 
 #define NIBBLE(x)    (hex_value (x))
 #define HEX2(buffer) ((NIBBLE ((buffer)[0]) << 4) + NIBBLE ((buffer)[1]))
 #define HEX4(buffer) ((HEX2 (buffer) << 8) + HEX2 ((buffer) + 2))
 #define ISHEX(x)     (hex_p (x))
 
 /* When we write out an ihex value, the values can not be output as
    they are seen.  Instead, we hold them in memory in this structure.
    Entries are chained through NEXT.  */
 
 struct ihex_data_list
 {
   struct ihex_data_list *next;
   bfd_byte *data;
   bfd_vma where;
   bfd_size_type size;
 };
 
 /* The ihex tdata information: the two ends of the ihex_data_list
    chain.  ihex_mkobject starts both out as NULL.  */
 
 struct ihex_data_struct
 {
   struct ihex_data_list *head;
   struct ihex_data_list *tail;
 };
 
 /* One-time setup: make sure the hex conversion table has been filled
    in.  Calls after the first do nothing.  */
 
 static void
 ihex_init (void)
 {
   static bfd_boolean inited;
 
   if (inited)
     return;
 
   inited = TRUE;
   hex_init ();
 }
 
 /* Create an ihex object: allocate the per-BFD ihex data with an empty
    list and attach it to ABFD.  Returns FALSE if the allocation fails.  */
 
 static bfd_boolean
 ihex_mkobject (bfd *abfd)
 {
   struct ihex_data_struct *ihex = bfd_alloc (abfd, sizeof (* ihex));
 
   if (ihex == NULL)
     return FALSE;
 
   ihex->head = NULL;
   ihex->tail = NULL;
   abfd->tdata.ihex_data = ihex;
 
   return TRUE;
 }
 
 /* Read one byte from ABFD and return it as a value in 0..255.  When no
    byte can be read, return EOF; *ERRORPTR is set to TRUE unless the
    failure was a plain truncated-file condition.  */
 
 static INLINE int
 ihex_get_byte (bfd *abfd, bfd_boolean *errorptr)
 {
   bfd_byte c;
 
   if (bfd_bread (&c, (bfd_size_type) 1, abfd) == 1)
     return (int) (c & 0xff);
 
   if (bfd_get_error () != bfd_error_file_truncated)
     *errorptr = TRUE;
 
   return EOF;
 }
 
 /* Report a problem in an Intel Hex file.  C is the unexpected character
    read at line LINENO, or EOF.  For EOF the BFD error is set to
    "file truncated" unless ERROR says an error was already recorded;
    otherwise the character is reported through the BFD error handler and
    the error is set to "bad value".
 
    ihex_scan passes C straight from a plain `char' buffer, so a byte
    >= 0x80 may arrive sign-extended.  It is masked to eight bits before
    being formatted: unmasked, "\%03o" could expand to thirteen bytes and
    overrun BUF.  */
 
 static void
 ihex_bad_byte (bfd *abfd, unsigned int lineno, int c, bfd_boolean error)
 {
   if (c == EOF)
     {
       if (! error)
 	bfd_set_error (bfd_error_file_truncated);
     }
   else
     {
       char buf[10];
 
       if (! ISPRINT (c))
 	sprintf (buf, "\\%03o", (unsigned int) c & 0xff);
       else
 	{
 	  buf[0] = c;
 	  buf[1] = '\0';
 	}
       (*_bfd_error_handler)
 	(_("%B:%d: unexpected character `%s' in Intel Hex file"),
 	 abfd, lineno, buf);
       bfd_set_error (bfd_error_bad_value);
     }
 }
 
 /* Read an Intel hex file and turn it into sections.  We create a new
    section for each contiguous set of bytes.
 
    The file is scanned from the start, one record at a time.  Every
    record must begin with ':' and consist of hex digits with a valid
    checksum.  Data records extend the current section or start a new
    one, address records update the segment or linear base, and an end
    record stops the scan.  Returns TRUE on success and FALSE on any
    read, format or allocation error.  */
 
 static bfd_boolean
 ihex_scan (bfd *abfd)
 {
   bfd_vma segbase;
   bfd_vma extbase;
   asection *sec;
   unsigned int lineno;
   bfd_boolean error;
   bfd_byte *buf = NULL;
   size_t bufsize;
   int c;
 
   if (bfd_seek (abfd, (file_ptr) 0, SEEK_SET) != 0)
     goto error_return;
 
   abfd->start_address = 0;
 
   segbase = 0;
   extbase = 0;
   sec = NULL;
   lineno = 1;
   error = FALSE;
   bufsize = 0;
 
   while ((c = ihex_get_byte (abfd, &error)) != EOF)
     {
       if (c == '\r')
 	continue;
       else if (c == '\n')
 	{
 	  ++lineno;
 	  continue;
 	}
       else if (c != ':')
 	{
 	  ihex_bad_byte (abfd, lineno, c, error);
 	  goto error_return;
 	}
       else
 	{
 	  file_ptr pos;
 	  char hdr[8];
 	  unsigned int i;
 	  unsigned int len;
 	  bfd_vma addr;
 	  unsigned int type;
 	  unsigned int chars;
 	  unsigned int chksum;
 
 	  /* This is a data record.  POS is the file offset of its
 	     leading ':'.  */
 	  pos = bfd_tell (abfd) - 1;
 
 	  /* Read the header bytes.  */
 	  if (bfd_bread (hdr, (bfd_size_type) 8, abfd) != 8)
 	    goto error_return;
 
 	  for (i = 0; i < 8; i++)
 	    {
 	      if (! ISHEX (hdr[i]))
 		{
 		  ihex_bad_byte (abfd, lineno, hdr[i], error);
 		  goto error_return;
 		}
 	    }
 
 	  /* Header layout: 2 digits length, 4 digits address, 2 digits
 	     record type.  */
 	  len = HEX2 (hdr);
 	  addr = HEX4 (hdr + 2);
 	  type = HEX2 (hdr + 6);
 
 	  /* Read the data bytes.  Two hex digits per data byte plus two
 	     for the checksum.  */
 	  chars = len * 2 + 2;
 	  if (chars >= bufsize)
 	    {
 	      buf = bfd_realloc (buf, (bfd_size_type) chars);
 	      if (buf == NULL)
 		goto error_return;
 	      bufsize = chars;
 	    }
 
 	  if (bfd_bread (buf, (bfd_size_type) chars, abfd) != chars)
 	    goto error_return;
 
 	  for (i = 0; i < chars; i++)
 	    {
 	      if (! ISHEX (buf[i]))
 		{
-		  ihex_bad_byte (abfd, lineno, hdr[i], error);
+		  ihex_bad_byte (abfd, lineno, buf[i], error);
 		  goto error_return;
 		}
 	    }
 
 	  /* Check the checksum.  After the loop I equals LEN, so
 	     buf + 2 * i addresses the checksum digits.  */
 	  chksum = len + addr + (addr >> 8) + type;
 	  for (i = 0; i < len; i++)
 	    chksum += HEX2 (buf + 2 * i);
 	  if (((- chksum) & 0xff) != (unsigned int) HEX2 (buf + 2 * i))
 	    {
 	      (*_bfd_error_handler)
 		(_("%B:%u: bad checksum in Intel Hex file (expected %u, found %u)"),
 		 abfd, lineno,
 		 (- chksum) & 0xff, (unsigned int) HEX2 (buf + 2 * i));
 	      bfd_set_error (bfd_error_bad_value);
 	      goto error_return;
 	    }
 
 	  switch (type)
 	    {
 	    case 0:
 	      /* This is a data record.  */
 	      if (sec != NULL
 		  && sec->vma + sec->size == extbase + segbase + addr)
 		{
 		  /* This data goes at the end of the section we are
                      currently building.  */
 		  sec->size += len;
 		}
 	      else if (len > 0)
 		{
 		  char secbuf[20];
 		  char *secname;
 		  bfd_size_type amt;
 		  flagword flags;
 
 		  /* Start a new section named after its ordinal.  */
 		  sprintf (secbuf, ".sec%d", bfd_count_sections (abfd) + 1);
 		  amt = strlen (secbuf) + 1;
 		  secname = bfd_alloc (abfd, amt);
 		  if (secname == NULL)
 		    goto error_return;
 		  strcpy (secname, secbuf);
 		  flags = SEC_HAS_CONTENTS | SEC_LOAD | SEC_ALLOC;
 		  sec = bfd_make_section_with_flags (abfd, secname, flags);
 		  if (sec == NULL)
 		    goto error_return;
 		  sec->vma = extbase + segbase + addr;
 		  sec->lma = extbase + segbase + addr;
 		  sec->size = len;
 		  sec->filepos = pos;
 		}
 	      break;
 
 	    case 1:
 	      /* An end record.  */
 	      if (abfd->start_address == 0)
 		abfd->start_address = addr;
 	      if (buf != NULL)
 		free (buf);
 	      return TRUE;
 
 	    case 2:
 	      /* An extended address record.  */
 	      if (len != 2)
 		{
 		  (*_bfd_error_handler)
 		    (_("%B:%u: bad extended address record length in Intel Hex file"),
 		     abfd, lineno);
 		  bfd_set_error (bfd_error_bad_value);
 		  goto error_return;
 		}
 
 	      segbase = HEX4 (buf) << 4;
 
 	      sec = NULL;
 
 	      break;
 
 	    case 3:
 	      /* An extended start address record.  */
 	      if (len != 4)
 		{
 		  (*_bfd_error_handler)
 		    (_("%B:%u: bad extended start address length in Intel Hex file"),
 		     abfd, lineno);
 		  bfd_set_error (bfd_error_bad_value);
 		  goto error_return;
 		}
 
 	      abfd->start_address += (HEX4 (buf) << 4) + HEX4 (buf + 4);
 
 	      sec = NULL;
 
 	      break;
 
 	    case 4:
 	      /* An extended linear address record.  */
 	      if (len != 2)
 		{
 		  (*_bfd_error_handler)
 		    (_("%B:%u: bad extended linear address record length in Intel Hex file"),
 		     abfd, lineno);
 		  bfd_set_error (bfd_error_bad_value);
 		  goto error_return;
 		}
 
 	      extbase = HEX4 (buf) << 16;
 
 	      sec = NULL;
 
 	      break;
 
 	    case 5:
 	      /* An extended linear start address record.  */
 	      if (len != 2 && len != 4)
 		{
 		  (*_bfd_error_handler)
 		    (_("%B:%u: bad extended linear start address length in Intel Hex file"),
 		     abfd, lineno);
 		  bfd_set_error (bfd_error_bad_value);
 		  goto error_return;
 		}
 
 	      if (len == 2)
 		abfd->start_address += HEX4 (buf) << 16;
 	      else
 		abfd->start_address = (HEX4 (buf) << 16) + HEX4 (buf + 4);
 
 	      sec = NULL;
 
 	      break;
 
 	    default:
 	      (*_bfd_error_handler)
 		(_("%B:%u: unrecognized ihex type %u in Intel Hex file"),
 		 abfd, lineno, type);
 	      bfd_set_error (bfd_error_bad_value);
 	      goto error_return;
 	    }
 	}
     }
 
   /* EOF without an end record: fine unless a read error was seen.  */
   if (error)
     goto error_return;
 
   if (buf != NULL)
     free (buf);
 
   return TRUE;
 
  error_return:
   if (buf != NULL)
     free (buf);
   return FALSE;
 }
 
 /* Try to recognize an Intel Hex file.  The first nine bytes must be a
    ':' followed by eight hex digits whose record type field is at most
    5; the whole file must then scan cleanly.  Returns the target vector
    on success.  On failure returns NULL, with the BFD error set to
    "wrong format" when the leading bytes do not look like Intel Hex, and
    with ABFD's tdata restored to what it was on entry.  */
 
 static const bfd_target *
 ihex_object_p (bfd *abfd)
 {
   void * saved_tdata;
   bfd_byte first[9];
   unsigned int n;
   unsigned int type;
 
   ihex_init ();
 
   if (bfd_seek (abfd, (file_ptr) 0, SEEK_SET) != 0)
     return NULL;
   if (bfd_bread (first, (bfd_size_type) 9, abfd) != 9)
     {
       if (bfd_get_error () == bfd_error_file_truncated)
 	goto wrong_format;
       return NULL;
     }
 
   if (first[0] != ':')
     goto wrong_format;
 
   for (n = 1; n < 9; n++)
     if (! ISHEX (first[n]))
       goto wrong_format;
 
   type = HEX2 (first + 7);
   if (type > 5)
     goto wrong_format;
 
   /* OK, it looks like it really is an Intel Hex file.  */
   saved_tdata = abfd->tdata.any;
   if (ihex_mkobject (abfd) && ihex_scan (abfd))
     return abfd->xvec;
 
   /* Undo whatever ihex_mkobject attached before giving up.  */
   if (abfd->tdata.any != saved_tdata && abfd->tdata.any != NULL)
     bfd_release (abfd, abfd->tdata.any);
   abfd->tdata.any = saved_tdata;
   return NULL;
 
  wrong_format:
   bfd_set_error (bfd_error_wrong_format);
   return NULL;
 }
 
 /* Read the contents of a section in an Intel Hex file.

    ABFD is positioned at SECTION->filepos and type 0 records are decoded
    into CONTENTS until SECTION->size bytes have been produced.  CONTENTS
    must have room for SECTION->size bytes.

    Returns TRUE once the whole section has been decoded.  Returns FALSE
    on a seek, read or allocation failure, on a record whose type is not
    0, or when the records do not add up to SECTION->size (either too
    little data before end of file, or a record that would run past the
    end of CONTENTS).  */
 
 static bfd_boolean
 ihex_read_section (bfd *abfd, asection *section, bfd_byte *contents)
 {
   int c;
   bfd_byte *p;
   bfd_byte *buf = NULL;
   size_t bufsize;
   bfd_boolean error;
 
   if (bfd_seek (abfd, section->filepos, SEEK_SET) != 0)
     goto error_return;
 
   p = contents;
   bufsize = 0;
   error = FALSE;
   while ((c = ihex_get_byte (abfd, &error)) != EOF)
     {
       char hdr[8];
       unsigned int len;
       unsigned int type;
       unsigned int i;
 
       if (c == '\r' || c == '\n')
 	continue;
 
       /* This is called after ihex_scan has succeeded, so we ought to
          know the exact format.  */
       BFD_ASSERT (c == ':');
 
       if (bfd_bread (hdr, (bfd_size_type) 8, abfd) != 8)
 	goto error_return;
 
       len = HEX2 (hdr);
       type = HEX2 (hdr + 6);
 
       /* We should only see type 0 records here.  */
       if (type != 0)
 	{
 	  (*_bfd_error_handler)
 	    (_("%B: internal error in ihex_read_section"), abfd);
 	  bfd_set_error (bfd_error_bad_value);
 	  goto error_return;
 	}
 
       /* Never decode past the end of CONTENTS.  The bytes consumed so
 	 far never exceed SECTION->size at this point, so the
 	 subtraction cannot wrap.  */
       if (len > section->size - (bfd_size_type) (p - contents))
 	{
 	  (*_bfd_error_handler)
 	    (_("%B: bad section length in ihex_read_section"), abfd);
 	  bfd_set_error (bfd_error_bad_value);
 	  goto error_return;
 	}
 
       if (len * 2 > bufsize)
 	{
 	  /* Keep the old pointer until the resize is known to have
 	     worked, so that error_return can still free it.  This
 	     assumes bfd_realloc, like realloc, leaves the old block
 	     allocated when it fails.  */
 	  bfd_byte *newbuf = bfd_realloc (buf, (bfd_size_type) len * 2);
 
 	  if (newbuf == NULL)
 	    goto error_return;
 	  buf = newbuf;
 	  bufsize = len * 2;
 	}
 
       if (bfd_bread (buf, (bfd_size_type) len * 2, abfd) != len * 2)
 	goto error_return;
 
       for (i = 0; i < len; i++)
 	*p++ = HEX2 (buf + 2 * i);
       if ((bfd_size_type) (p - contents) >= section->size)
 	{
 	  /* We've read everything in the section.  */
 	  if (buf != NULL)
 	    free (buf);
 	  return TRUE;
 	}
 
       /* Skip the checksum.  It is read into HDR, which is no longer
 	 needed, because BUF is still NULL if only zero-length records
 	 have been seen so far.  */
       if (bfd_bread (hdr, (bfd_size_type) 2, abfd) != 2)
 	goto error_return;
     }
 
   if ((bfd_size_type) (p - contents) < section->size)
     {
       (*_bfd_error_handler)
 	(_("%B: bad section length in ihex_read_section"), abfd);
       bfd_set_error (bfd_error_bad_value);
       goto error_return;
     }
 
   if (buf != NULL)
     free (buf);
 
   return TRUE;
 
  error_return:
   if (buf != NULL)
     free (buf);
   return FALSE;
 }
 
 /* Get the contents of a section in an Intel Hex file.

    The first call for SECTION decodes the whole section into a buffer
    obtained from bfd_alloc and caches it in SECTION->used_by_bfd; later
    calls copy straight from that cache.  COUNT bytes starting at OFFSET
    within the section are copied to LOCATION.

    Returns TRUE on success, FALSE if the buffer cannot be allocated or
    the section cannot be read.  */
 
 static bfd_boolean
 ihex_get_section_contents (bfd *abfd,
 			   asection *section,
 			   void * location,
 			   file_ptr offset,
 			   bfd_size_type count)
 {
   if (section->used_by_bfd == NULL)
     {
       bfd_byte *contents;
 
       contents = bfd_alloc (abfd, section->size);
       if (contents == NULL)
 	return FALSE;
       if (! ihex_read_section (abfd, section, contents))
 	return FALSE;
 
       /* Publish the cache only once it is completely filled in.  If it
 	 were installed before the read, a failed read would leave a
 	 partly written buffer behind and the next call would return it
 	 as valid data.  */
       section->used_by_bfd = contents;
     }
 
   memcpy (location, (bfd_byte *) section->used_by_bfd + offset,
 	  (size_t) count);
 
   return TRUE;
 }
 
 /* Record COUNT bytes from LOCATION as output data for SECTION.

    Nothing is written here.  The bytes are copied into memory obtained
    from bfd_alloc and queued on the list kept in ABFD's ihex tdata,
    ordered by load address (SECTION->lma + OFFSET).  Empty requests and
    sections lacking either SEC_ALLOC or SEC_LOAD are accepted and
    ignored.  Returns FALSE only when an allocation fails.  */
 
 static bfd_boolean
 ihex_set_section_contents (bfd *abfd,
 			   asection *section,
 			   const void * location,
 			   file_ptr offset,
 			   bfd_size_type count)
 {
   struct ihex_data_struct *list;
   struct ihex_data_list *node;
   bfd_byte *payload;
 
   if (count == 0)
     return TRUE;
   if ((section->flags & SEC_ALLOC) == 0
       || (section->flags & SEC_LOAD) == 0)
     return TRUE;
 
   node = bfd_alloc (abfd, sizeof (* node));
   if (node == NULL)
     return FALSE;
 
   payload = bfd_alloc (abfd, count);
   if (payload == NULL)
     return FALSE;
   memcpy (payload, location, (size_t) count);
 
   node->data = payload;
   node->where = section->lma + offset;
   node->size = count;
 
   list = abfd->tdata.ihex_data;
   if (list->tail == NULL || node->where < list->tail->where)
     {
       /* General case: walk from the head to the first entry that does
 	 not sort before the new one and splice in ahead of it.  */
       struct ihex_data_list **link = &list->head;
 
       while (*link != NULL && (*link)->where < node->where)
 	link = &(*link)->next;
 
       node->next = *link;
       *link = node;
       if (node->next == NULL)
 	list->tail = node;
     }
   else
     {
       /* Common case: the new record belongs at the end.  */
       node->next = NULL;
       list->tail->next = node;
       list->tail = node;
     }
 
   return TRUE;
 }
 
 /* Format one Intel Hex record and write it to ABFD.

    The line consists of ':', a two-digit byte count, the low 16 bits of
    ADDR as four digits, a two-digit TYPE, COUNT data bytes from DATA as
    two digits each, a two-digit checksum, and "\r\n".  The checksum is
    the low byte of the negated sum of the count, both address bytes,
    the type and every data byte.  The line buffer holds at most CHUNK
    data bytes.  Returns FALSE if the write comes up short.  */
 
 static bfd_boolean
 ihex_write_record (bfd *abfd,
 		   size_t count,
 		   unsigned int addr,
 		   unsigned int type,
 		   bfd_byte *data)
 {
   static const char digs[] = "0123456789ABCDEF";
   char line[9 + CHUNK * 2 + 4];
   char *out;
   unsigned int sum;
   size_t n;
   size_t length;
 
 #define TOHEX(buf, v) \
   ((buf)[0] = digs[((v) >> 4) & 0xf], (buf)[1] = digs[(v) & 0xf])
 
   /* Fixed nine-character prefix.  */
   line[0] = ':';
   TOHEX (line + 1, count);
   TOHEX (line + 3, (addr >> 8) & 0xff);
   TOHEX (line + 5, addr & 0xff);
   TOHEX (line + 7, type);
 
   sum = count + addr + (addr >> 8) + type;
 
   /* Data bytes, two hex digits apiece.  */
   out = line + 9;
   for (n = 0; n < count; n++)
     {
       bfd_byte byte = data[n];
 
       TOHEX (out, byte);
       sum += byte;
       out += 2;
     }
 
   /* Checksum and line terminator.  */
   TOHEX (out, (- sum) & 0xff);
   out[2] = '\r';
   out[3] = '\n';
 
   length = 9 + count * 2 + 4;
   if (bfd_bwrite (line, (bfd_size_type) length, abfd) == length)
     return TRUE;
 
   return FALSE;
 }
 
 /* Write out an Intel Hex file.

    Walks the list of queued data chunks in ABFD's ihex tdata and emits
    each one as a series of type 0 records of at most CHUNK bytes.
    Whenever a chunk's address lies more than 0xffff above the current
    base, a new base is established first: a type 2 record for addresses
    up to 0xfffff, otherwise a type 4 record.  After the data, a non-zero
    ABFD->start_address is emitted as a type 3 record (up to 0xfffff) or
    a type 5 record, and finally a zero-length type 1 record is written.

    Returns TRUE on success.  Returns FALSE if any record cannot be
    written, or if an address does not fit in 32 bits, in which case an
    error is reported and bfd_error_bad_value is set.  */
 
 static bfd_boolean
 ihex_write_object_contents (bfd *abfd)
 {
   /* Base set by the last type 2 record (SEGBASE) and by the last
      type 4 record (EXTBASE).  Record addresses are relative to their
      sum.  */
   bfd_vma segbase;
   bfd_vma extbase;
   struct ihex_data_list *l;
 
   segbase = 0;
   extbase = 0;
   for (l = abfd->tdata.ihex_data->head; l != NULL; l = l->next)
     {
       bfd_vma where;
       bfd_byte *p;
       bfd_size_type count;
 
       where = l->where;
       p = l->data;
       count = l->size;
 
       while (count > 0)
 	{
 	  size_t now;
 	  unsigned int rec_addr;
 
 	  /* Emit at most CHUNK bytes per record.  */
 	  now = count;
 	  if (count > CHUNK)
 	    now = CHUNK;
 
 	  if (where > segbase + extbase + 0xffff)
 	    {
 	      bfd_byte addr[2];
 
 	      /* We need a new base address.  */
 	      if (where <= 0xfffff)
 		{
 		  /* The addresses should be sorted.  */
 		  BFD_ASSERT (extbase == 0);
 
 		  /* The record carries SEGBASE >> 4, high byte first.  */
 		  segbase = where & 0xf0000;
 		  addr[0] = (bfd_byte)(segbase >> 12) & 0xff;
 		  addr[1] = (bfd_byte)(segbase >> 4) & 0xff;
 		  if (! ihex_write_record (abfd, 2, 0, 2, addr))
 		    return FALSE;
 		}
 	      else
 		{
 		  /* The extended address record and the extended
                      linear address record are combined, at least by
                      some readers.  We need an extended linear address
                      record here, so if we've already written out an
                      extended address record, zero it out to avoid
                      confusion.  */
 		  if (segbase != 0)
 		    {
 		      addr[0] = 0;
 		      addr[1] = 0;
 		      if (! ihex_write_record (abfd, 2, 0, 2, addr))
 			return FALSE;
 		      segbase = 0;
 		    }
 
 		  /* EXTBASE keeps only bits 16-31 of WHERE, so any
 		     address above 0xffffffff is still out of reach
 		     and is rejected.  */
 		  extbase = where & 0xffff0000;
 		  if (where > extbase + 0xffff)
 		    {
 		      char buf[20];
 
 		      sprintf_vma (buf, where);
 		      (*_bfd_error_handler)
 			(_("%s: address 0x%s out of range for Intel Hex file"),
 			 bfd_get_filename (abfd), buf);
 		      bfd_set_error (bfd_error_bad_value);
 		      return FALSE;
 		    }
 		  /* The record carries the upper 16 bits of EXTBASE,
 		     high byte first.  */
 		  addr[0] = (bfd_byte)(extbase >> 24) & 0xff;
 		  addr[1] = (bfd_byte)(extbase >> 16) & 0xff;
 		  if (! ihex_write_record (abfd, 2, 0, 4, addr))
 		    return FALSE;
 		}
 	    }
 
 	  /* Offset of this record from the current base.  */
 	  rec_addr = where - (extbase + segbase);
 
           /* Output records shouldn't cross 64K boundaries.  */
           if (rec_addr + now > 0xffff)
             now = 0x10000 - rec_addr;
 
 	  if (! ihex_write_record (abfd, now, rec_addr, 0, p))
 	    return FALSE;
 
 	  where += now;
 	  p += now;
 	  count -= now;
 	}
     }
 
   /* A start address of zero is not written at all.  */
   if (abfd->start_address != 0)
     {
       bfd_vma start;
       bfd_byte startbuf[4];
 
       start = abfd->start_address;
 
       if (start <= 0xfffff)
 	{
 	  /* Type 3 payload: bits 16-19 of START shifted into the
 	     first byte, a zero byte, then the low 16 bits of START.  */
 	  startbuf[0] = (bfd_byte)((start & 0xf0000) >> 12) & 0xff;
 	  startbuf[1] = 0;
 	  startbuf[2] = (bfd_byte)(start >> 8) & 0xff;
 	  startbuf[3] = (bfd_byte)start & 0xff;
 	  if (! ihex_write_record (abfd, 4, 0, 3, startbuf))
 	    return FALSE;
 	}
       else
 	{
 	  /* Type 5 payload: the low 32 bits of START, high byte
 	     first.  */
 	  startbuf[0] = (bfd_byte)(start >> 24) & 0xff;
 	  startbuf[1] = (bfd_byte)(start >> 16) & 0xff;
 	  startbuf[2] = (bfd_byte)(start >> 8) & 0xff;
 	  startbuf[3] = (bfd_byte)start & 0xff;
 	  if (! ihex_write_record (abfd, 4, 0, 5, startbuf))
 	    return FALSE;
 	}
     }
 
   /* Closing record: type 1, no data.  */
   if (! ihex_write_record (abfd, 0, 0, 1, NULL))
     return FALSE;
 
   return TRUE;
 }
 
 /* Set the architecture and machine for the output file.

    The request is passed to bfd_default_set_arch_mach.  If that fails,
    the failure is only reported to the caller when ARCH is something
    other than bfd_arch_unknown; the architecture is irrelevant for this
    format, so an unknown one is not treated as an error.  */
 
 static bfd_boolean
 ihex_set_arch_mach (bfd *abfd,
 		    enum bfd_architecture arch,
 		    unsigned long mach)
 {
   if (bfd_default_set_arch_mach (abfd, arch, mach))
     return TRUE;
 
   if (arch == bfd_arch_unknown)
     return TRUE;
 
   return FALSE;
 }
 
 /* Report the size of the headers to the linker.  This target always
    answers zero; neither argument is looked at.  */
 
 static int
 ihex_sizeof_headers (bfd *abfd ATTRIBUTE_UNUSED,
 		     struct bfd_link_info *info ATTRIBUTE_UNUSED)
 {
   const int header_bytes = 0;
 
   return header_bytes;
 }
 
 /* Some random definitions for the target vector.

    Each ihex_* name below is an alias for a generic, "no symbols" or
    constant-returning BFD routine.  No Intel Hex specific code is
    attached to any of these entry points.
    NOTE(review): these names are presumably picked up by the
    BFD_JUMP_TABLE_* (ihex) macros used in ihex_vec -- confirm against
    the macro definitions in bfd.h.  */
 
 /* Generic entry points.  */
 #define	ihex_close_and_cleanup                    _bfd_generic_close_and_cleanup
 #define ihex_bfd_free_cached_info                 _bfd_generic_bfd_free_cached_info
 #define ihex_new_section_hook                     _bfd_generic_new_section_hook
 #define ihex_get_section_contents_in_window       _bfd_generic_get_section_contents_in_window
 /* Symbol entry points.  */
 #define ihex_get_symtab_upper_bound               bfd_0l
 #define ihex_canonicalize_symtab                  ((long (*) (bfd *, asymbol **)) bfd_0l)
 #define ihex_make_empty_symbol                    _bfd_generic_make_empty_symbol
 #define ihex_print_symbol                         _bfd_nosymbols_print_symbol
 #define ihex_get_symbol_info                      _bfd_nosymbols_get_symbol_info
 #define ihex_bfd_is_target_special_symbol         ((bfd_boolean (*) (bfd *, asymbol *)) bfd_false)
 #define ihex_bfd_is_local_label_name              _bfd_nosymbols_bfd_is_local_label_name
 #define ihex_get_lineno                           _bfd_nosymbols_get_lineno
 #define ihex_find_nearest_line                    _bfd_nosymbols_find_nearest_line
 #define ihex_find_inliner_info                    _bfd_nosymbols_find_inliner_info
 #define ihex_bfd_make_debug_symbol                _bfd_nosymbols_bfd_make_debug_symbol
 #define ihex_read_minisymbols                     _bfd_nosymbols_read_minisymbols
 #define ihex_minisymbol_to_symbol                 _bfd_nosymbols_minisymbol_to_symbol
 /* Linker entry points.  */
 #define ihex_bfd_get_relocated_section_contents   bfd_generic_get_relocated_section_contents
 #define ihex_bfd_relax_section                    bfd_generic_relax_section
 #define ihex_bfd_gc_sections                      bfd_generic_gc_sections
 #define ihex_bfd_merge_sections                   bfd_generic_merge_sections
 #define ihex_bfd_is_group_section                 bfd_generic_is_group_section
 #define ihex_bfd_discard_group                    bfd_generic_discard_group
 #define ihex_section_already_linked               _bfd_generic_section_already_linked
 #define ihex_bfd_link_hash_table_create           _bfd_generic_link_hash_table_create
 #define ihex_bfd_link_hash_table_free             _bfd_generic_link_hash_table_free
 #define ihex_bfd_link_add_symbols                 _bfd_generic_link_add_symbols
 #define ihex_bfd_link_just_syms                   _bfd_generic_link_just_syms
 #define ihex_bfd_final_link                       _bfd_generic_final_link
 #define ihex_bfd_link_split_section               _bfd_generic_link_split_section
 
 /* The Intel Hex target vector.

    This is the bfd_target description for the "ihex" format.  Byte
    order is declared unknown and the big-endian accessors are used for
    both data and headers.  Of the three four-entry tables, only the
    second slot of each carries Intel Hex specific code (ihex_object_p,
    ihex_mkobject and ihex_write_object_contents).  */
 
 const bfd_target ihex_vec =
 {
   "ihex",			/* Name.  */
   bfd_target_ihex_flavour,
   BFD_ENDIAN_UNKNOWN,		/* Target byte order.  */
   BFD_ENDIAN_UNKNOWN,		/* Target headers byte order.  */
   0,				/* Object flags.  */
   (SEC_HAS_CONTENTS | SEC_ALLOC | SEC_LOAD),	/* Section flags.  */
   0,				/* Leading underscore.  */
   ' ',				/* AR_pad_char.  */
   16,				/* AR_max_namelen.  */
   bfd_getb64, bfd_getb_signed_64, bfd_putb64,
   bfd_getb32, bfd_getb_signed_32, bfd_putb32,
   bfd_getb16, bfd_getb_signed_16, bfd_putb16,	/* Data.  */
   bfd_getb64, bfd_getb_signed_64, bfd_putb64,
   bfd_getb32, bfd_getb_signed_32, bfd_putb32,
   bfd_getb16, bfd_getb_signed_16, bfd_putb16,	/* Headers. */
 
   /* NOTE(review): the four slots of each table below are presumably
      indexed by bfd_format (unknown, object, archive, core) -- confirm
      against the bfd_target declaration.  */
   {
     _bfd_dummy_target,
     ihex_object_p,		/* bfd_check_format.  */
     _bfd_dummy_target,
     _bfd_dummy_target,
   },
   /* NOTE(review): presumably the bfd_set_format table -- confirm.  */
   {
     bfd_false,
     ihex_mkobject,
     _bfd_generic_mkarchive,
     bfd_false,
   },
   {				/* bfd_write_contents.  */
     bfd_false,
     ihex_write_object_contents,
     _bfd_write_archive_contents,
     bfd_false,
   },
 
   /* Only the generic, symbol, write and link tables use ihex_ names;
      the remaining tables name shared BFD prefixes.  */
   BFD_JUMP_TABLE_GENERIC (ihex),
   BFD_JUMP_TABLE_COPY (_bfd_generic),
   BFD_JUMP_TABLE_CORE (_bfd_nocore),
   BFD_JUMP_TABLE_ARCHIVE (_bfd_noarchive),
   BFD_JUMP_TABLE_SYMBOLS (ihex),
   BFD_JUMP_TABLE_RELOCS (_bfd_norelocs),
   BFD_JUMP_TABLE_WRITE (ihex),
   BFD_JUMP_TABLE_LINK (ihex),
   BFD_JUMP_TABLE_DYNAMIC (_bfd_nodynamic),
 
   NULL,
 
   NULL
 };
Index: projects/bsd_rdma_4_9/contrib/binutils/bfd/peXXigen.c
===================================================================
--- projects/bsd_rdma_4_9/contrib/binutils/bfd/peXXigen.c	(revision 326161)
+++ projects/bsd_rdma_4_9/contrib/binutils/bfd/peXXigen.c	(revision 326162)
@@ -1,2173 +1,2193 @@
 /* Support for the generic parts of PE/PEI; the common executable parts.
    Copyright 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
    2005, 2006, 2007 Free Software Foundation, Inc.
    Written by Cygnus Solutions.
 
    This file is part of BFD, the Binary File Descriptor library.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
 
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
 
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
 
 /* Most of this hacked by Steve Chamberlain <sac@cygnus.com>.
 
    PE/PEI rearrangement (and code added): Donn Terry
 					  Softway Systems, Inc.  */
 
 /* Hey look, some documentation [and in a place you expect to find it]!
 
    The main reference for the pei format is "Microsoft Portable Executable
    and Common Object File Format Specification 4.1".  Get it if you need to
    do some serious hacking on this code.
 
    Another reference:
    "Peering Inside the PE: A Tour of the Win32 Portable Executable
    File Format", MSJ 1994, Volume 9.
 
    The *sole* difference between the pe format and the pei format is that the
    latter has an MSDOS 2.0 .exe header on the front that prints the message
    "This app must be run under Windows." (or some such).
    (FIXME: Whether that statement is *really* true or not is unknown.
    Are there more subtle differences between pe and pei formats?
    For now assume there aren't.  If you find one, then for God sakes
    document it here!)
 
    The Microsoft docs use the word "image" instead of "executable" because
    the former can also refer to a DLL (shared library).  Confusion can arise
    because the `i' in `pei' also refers to "image".  The `pe' format can
    also create images (i.e. executables), it's just that to run on a win32
    system you need to use the pei format.
 
    FIXME: Please add more docs here so the next poor fool that has to hack
    on this code has a chance of getting something accomplished without
    wasting too much time.  */
 
 /* This expands into COFF_WITH_pe, COFF_WITH_pep, or COFF_WITH_pex64
    depending on whether we're compiling for straight PE or PE+.  */
 #define COFF_WITH_XX
 
 #include "sysdep.h"
 #include "bfd.h"
 #include "libbfd.h"
 #include "coff/internal.h"
 
 /* NOTE: it's strange to be including an architecture specific header
    in what's supposed to be general (to PE/PEI) code.  However, that's
    where the definitions are, and they don't vary per architecture
    within PE/PEI, so we get them from there.  FIXME: The lack of
    variance is an assumption which may prove to be incorrect if new
    PE/PEI targets are created.  */
 #if defined COFF_WITH_pex64
 # include "coff/x86_64.h"
 #elif defined COFF_WITH_pep
 # include "coff/ia64.h"
 #else
 # include "coff/i386.h"
 #endif
 
 #include "coff/pe.h"
 #include "libcoff.h"
 #include "libpei.h"
 
 #if defined COFF_WITH_pep || defined COFF_WITH_pex64
 # undef AOUTSZ
 # define AOUTSZ		PEPAOUTSZ
 # define PEAOUTHDR	PEPAOUTHDR
 #endif
 
 /* FIXME: This file has various tests of POWERPC_LE_PE.  Those tests
    worked when the code was in peicode.h, but no longer work now that
    the code is in peigen.c.  PowerPC NT is said to be dead.  If
    anybody wants to revive the code, you will have to figure out how
    to handle those issues.  */
 
 /* Convert the external symbol table entry EXT1 into the internal
    form IN1, using ABFD's byte order.

    Unless STRICT_PE_FORMAT is defined, C_SECTION symbols get extra
    treatment: their value is forced to 0, a section number of 0 is
    replaced by the target index of the section with the same name
    (an empty section is created if none exists), and the storage class
    is rewritten to C_STAT.  If the memory or the section needed for
    that cannot be obtained, the function returns early and leaves the
    symbol with class C_SECTION and section number 0.  */
 
 void
 _bfd_XXi_swap_sym_in (bfd * abfd, void * ext1, void * in1)
 {
   SYMENT *ext = (SYMENT *) ext1;
   struct internal_syment *in = (struct internal_syment *) in1;
 
   if (ext->e.e_name[0] == 0)
     {
       in->_n._n_n._n_zeroes = 0;
       in->_n._n_n._n_offset = H_GET_32 (abfd, ext->e.e.e_offset);
     }
   else
     memcpy (in->_n._n_name, ext->e.e_name, SYMNMLEN);
 
   in->n_value = H_GET_32 (abfd, ext->e_value);
   in->n_scnum = H_GET_16 (abfd, ext->e_scnum);
 
   /* The width of the external type field depends on the target.  */
   if (sizeof (ext->e_type) == 2)
     in->n_type = H_GET_16 (abfd, ext->e_type);
   else
     in->n_type = H_GET_32 (abfd, ext->e_type);
 
   in->n_sclass = H_GET_8 (abfd, ext->e_sclass);
   in->n_numaux = H_GET_8 (abfd, ext->e_numaux);
 
 #ifndef STRICT_PE_FORMAT
   /* This is for Gnu-created DLLs.  */
 
   /* The section symbols for the .idata$ sections have class 0x68
      (C_SECTION), which MS documentation indicates is a section
      symbol.  Unfortunately, the value field in the symbol is simply a
      copy of the .idata section's flags rather than something useful.
      When these symbols are encountered, change the value to 0 so that
      they will be handled somewhat correctly in the bfd code.  */
   if (in->n_sclass == C_SECTION)
     {
       char namebuf[SYMNMLEN + 1];
 
       in->n_value = 0x0;
 
       /* A name that fills all SYMNMLEN bytes has no terminating NUL,
 	 so work from a terminated copy instead of handing the raw
 	 field to strcmp, strlen and strcpy.  (n_name is presumably the
 	 same SYMNMLEN-byte field that was filled above.)  */
       memcpy (namebuf, in->n_name, SYMNMLEN);
       namebuf[SYMNMLEN] = '\0';
 
       /* Create synthetic empty sections as needed.  DJ */
       if (in->n_scnum == 0)
 	{
 	  asection *sec;
 
 	  for (sec = abfd->sections; sec; sec = sec->next)
 	    {
 	      if (strcmp (sec->name, namebuf) == 0)
 		{
 		  in->n_scnum = sec->target_index;
 		  break;
 		}
 	    }
 	}
 
       if (in->n_scnum == 0)
 	{
 	  int unused_section_number = 0;
 	  asection *sec;
 	  char *name;
 	  flagword flags;
 
 	  /* Pick a target index one above the highest one in use.  */
 	  for (sec = abfd->sections; sec; sec = sec->next)
 	    if (unused_section_number <= sec->target_index)
 	      unused_section_number = sec->target_index + 1;
 
 	  name = bfd_alloc (abfd, (bfd_size_type) strlen (namebuf) + 10);
 	  if (name == NULL)
 	    return;
 	  strcpy (name, namebuf);
 	  flags = SEC_HAS_CONTENTS | SEC_ALLOC | SEC_DATA | SEC_LOAD;
 	  sec = bfd_make_section_anyway_with_flags (abfd, name, flags);
 	  /* Section creation can fail; bail out the same way as for
 	     the name allocation instead of dereferencing NULL.  */
 	  if (sec == NULL)
 	    return;
 
 	  sec->vma = 0;
 	  sec->lma = 0;
 	  sec->size = 0;
 	  sec->filepos = 0;
 	  sec->rel_filepos = 0;
 	  sec->reloc_count = 0;
 	  sec->line_filepos = 0;
 	  sec->lineno_count = 0;
 	  sec->userdata = NULL;
 	  sec->next = NULL;
 	  sec->alignment_power = 2;
 
 	  sec->target_index = unused_section_number;
 
 	  in->n_scnum = unused_section_number;
 	}
       in->n_sclass = C_STAT;
     }
 #endif
 
 #ifdef coff_swap_sym_in_hook
   /* This won't work in peigen.c, but since it's for PPC PE, it's not
      worth fixing.  */
   coff_swap_sym_in_hook (abfd, ext1, in1);
 #endif
 }
 
 /* Convert the internal symbol INP into the external symbol table
    entry EXTP, using ABFD's byte order.  A name whose first byte is 0
    is written as a zero word followed by its offset; any other name is
    copied inline.  Returns SYMESZ.  */
 
 unsigned int
 _bfd_XXi_swap_sym_out (bfd * abfd, void * inp, void * extp)
 {
   struct internal_syment *sym = (struct internal_syment *) inp;
   SYMENT *raw = (SYMENT *) extp;
 
   if (sym->_n._n_name[0] != 0)
     memcpy (raw->e.e_name, sym->_n._n_name, SYMNMLEN);
   else
     {
       H_PUT_32 (abfd, 0, raw->e.e.e_zeroes);
       H_PUT_32 (abfd, sym->_n._n_n._n_offset, raw->e.e.e_offset);
     }
 
   H_PUT_32 (abfd, sym->n_value, raw->e_value);
   H_PUT_16 (abfd, sym->n_scnum, raw->e_scnum);
 
   /* The width of the external type field depends on the target.  */
   if (sizeof (raw->e_type) != 2)
     H_PUT_32 (abfd, sym->n_type, raw->e_type);
   else
     H_PUT_16 (abfd, sym->n_type, raw->e_type);
 
   H_PUT_8 (abfd, sym->n_sclass, raw->e_sclass);
   H_PUT_8 (abfd, sym->n_numaux, raw->e_numaux);
 
   return SYMESZ;
 }
 
 /* Convert the external auxiliary entry EXT1 into the internal form
    IN1.  TYPE and CLASS are those of the symbol the entry belongs to
    and select which layout is decoded: a file name for C_FILE, a
    section summary for C_STAT, C_LEAFSTAT and C_HIDDEN symbols of type
    T_NULL, and the general symbol layout for everything else.  INDX
    and NUMAUX are not used.  */
 
 void
 _bfd_XXi_swap_aux_in (bfd *	abfd,
 		      void *	ext1,
 		      int       type,
 		      int       class,
 		      int	indx ATTRIBUTE_UNUSED,
 		      int	numaux ATTRIBUTE_UNUSED,
 		      void * 	in1)
 {
   AUXENT *ext = (AUXENT *) ext1;
   union internal_auxent *in = (union internal_auxent *) in1;
 
   if (class == C_FILE)
     {
       /* A leading zero byte means only an offset is stored.  */
       if (ext->x_file.x_fname[0] != 0)
 	memcpy (in->x_file.x_fname, ext->x_file.x_fname, FILNMLEN);
       else
 	{
 	  in->x_file.x_n.x_zeroes = 0;
 	  in->x_file.x_n.x_offset = H_GET_32 (abfd, ext->x_file.x_n.x_offset);
 	}
       return;
     }
 
   if ((class == C_STAT || class == C_LEAFSTAT || class == C_HIDDEN)
       && type == T_NULL)
     {
       in->x_scn.x_scnlen = GET_SCN_SCNLEN (abfd, ext);
       in->x_scn.x_nreloc = GET_SCN_NRELOC (abfd, ext);
       in->x_scn.x_nlinno = GET_SCN_NLINNO (abfd, ext);
       in->x_scn.x_checksum = H_GET_32 (abfd, ext->x_scn.x_checksum);
       in->x_scn.x_associated = H_GET_16 (abfd, ext->x_scn.x_associated);
       in->x_scn.x_comdat = H_GET_8 (abfd, ext->x_scn.x_comdat);
       return;
     }
 
   /* General symbol layout.  */
   in->x_sym.x_tagndx.l = H_GET_32 (abfd, ext->x_sym.x_tagndx);
   in->x_sym.x_tvndx = H_GET_16 (abfd, ext->x_sym.x_tvndx);
 
   if (class == C_BLOCK || class == C_FCN || ISFCN (type) || ISTAG (class))
     {
       in->x_sym.x_fcnary.x_fcn.x_lnnoptr = GET_FCN_LNNOPTR (abfd, ext);
       in->x_sym.x_fcnary.x_fcn.x_endndx.l = GET_FCN_ENDNDX (abfd, ext);
     }
   else
     {
       int dim;
 
       /* The first four array dimensions.  */
       for (dim = 0; dim < 4; dim++)
 	in->x_sym.x_fcnary.x_ary.x_dimen[dim] =
 	  H_GET_16 (abfd, ext->x_sym.x_fcnary.x_ary.x_dimen[dim]);
     }
 
   if (! ISFCN (type))
     {
       in->x_sym.x_misc.x_lnsz.x_lnno = GET_LNSZ_LNNO (abfd, ext);
       in->x_sym.x_misc.x_lnsz.x_size = GET_LNSZ_SIZE (abfd, ext);
     }
   else
     {
       in->x_sym.x_misc.x_fsize = H_GET_32 (abfd, ext->x_sym.x_misc.x_fsize);
     }
 }
 
 /* Convert the internal auxiliary entry INP into the external form
    EXTP, which is cleared first.  TYPE and CLASS are those of the
    symbol the entry belongs to and select which layout is written: a
    file name for C_FILE, a section summary for C_STAT, C_LEAFSTAT and
    C_HIDDEN symbols of type T_NULL, and the general symbol layout for
    everything else.  INDX and NUMAUX are not used.  Returns AUXESZ.  */
 
 unsigned int
 _bfd_XXi_swap_aux_out (bfd *  abfd,
 		       void * inp,
 		       int    type,
 		       int    class,
 		       int    indx ATTRIBUTE_UNUSED,
 		       int    numaux ATTRIBUTE_UNUSED,
 		       void * extp)
 {
   union internal_auxent *in = (union internal_auxent *) inp;
   AUXENT *ext = (AUXENT *) extp;
 
   memset (ext, 0, AUXESZ);
 
   if (class == C_FILE)
     {
       /* A leading zero byte means only an offset is stored.  */
       if (in->x_file.x_fname[0] != 0)
 	memcpy (ext->x_file.x_fname, in->x_file.x_fname, FILNMLEN);
       else
 	{
 	  H_PUT_32 (abfd, 0, ext->x_file.x_n.x_zeroes);
 	  H_PUT_32 (abfd, in->x_file.x_n.x_offset, ext->x_file.x_n.x_offset);
 	}
 
       return AUXESZ;
     }
 
   if ((class == C_STAT || class == C_LEAFSTAT || class == C_HIDDEN)
       && type == T_NULL)
     {
       PUT_SCN_SCNLEN (abfd, in->x_scn.x_scnlen, ext);
       PUT_SCN_NRELOC (abfd, in->x_scn.x_nreloc, ext);
       PUT_SCN_NLINNO (abfd, in->x_scn.x_nlinno, ext);
       H_PUT_32 (abfd, in->x_scn.x_checksum, ext->x_scn.x_checksum);
       H_PUT_16 (abfd, in->x_scn.x_associated, ext->x_scn.x_associated);
       H_PUT_8 (abfd, in->x_scn.x_comdat, ext->x_scn.x_comdat);
       return AUXESZ;
     }
 
   /* General symbol layout.  */
   H_PUT_32 (abfd, in->x_sym.x_tagndx.l, ext->x_sym.x_tagndx);
   H_PUT_16 (abfd, in->x_sym.x_tvndx, ext->x_sym.x_tvndx);
 
   if (class == C_BLOCK || class == C_FCN || ISFCN (type) || ISTAG (class))
     {
       PUT_FCN_LNNOPTR (abfd, in->x_sym.x_fcnary.x_fcn.x_lnnoptr,  ext);
       PUT_FCN_ENDNDX  (abfd, in->x_sym.x_fcnary.x_fcn.x_endndx.l, ext);
     }
   else
     {
       int dim;
 
       /* The first four array dimensions.  */
       for (dim = 0; dim < 4; dim++)
 	{
 	  H_PUT_16 (abfd, in->x_sym.x_fcnary.x_ary.x_dimen[dim],
 		    ext->x_sym.x_fcnary.x_ary.x_dimen[dim]);
 	}
     }
 
   if (! ISFCN (type))
     {
       PUT_LNSZ_LNNO (abfd, in->x_sym.x_misc.x_lnsz.x_lnno, ext);
       PUT_LNSZ_SIZE (abfd, in->x_sym.x_misc.x_lnsz.x_size, ext);
     }
   else
     {
       H_PUT_32 (abfd, in->x_sym.x_misc.x_fsize, ext->x_sym.x_misc.x_fsize);
     }
 
   return AUXESZ;
 }
 
 /* Convert the external line number entry EXT1 into the internal form
    IN1: the symbol index is read as a 32-bit field, the line number
    through GET_LINENO_LNNO.  */
 
 void
 _bfd_XXi_swap_lineno_in (bfd * abfd, void * ext1, void * in1)
 {
   struct internal_lineno *in;
   LINENO *ext;
 
   ext = (LINENO *) ext1;
   in = (struct internal_lineno *) in1;
 
   in->l_addr.l_symndx = H_GET_32 (abfd, ext->l_addr.l_symndx);
   in->l_lnno = GET_LINENO_LNNO (abfd, ext);
 }
 
 /* Convert the internal line number entry INP into the external form
    OUTP: the symbol index is written as a 32-bit field, the line
    number through PUT_LINENO_LNNO.  Returns LINESZ.  */
 
 unsigned int
 _bfd_XXi_swap_lineno_out (bfd * abfd, void * inp, void * outp)
 {
   struct internal_lineno *in;
   struct external_lineno *ext;
 
   in = (struct internal_lineno *) inp;
   ext = (struct external_lineno *) outp;
 
   H_PUT_32 (abfd, in->l_addr.l_symndx, ext->l_addr.l_symndx);
   PUT_LINENO_LNNO (abfd, in->l_lnno, ext);
 
   return LINESZ;
 }
 
 /* Convert the external optional (a.out) header AOUTHDR_EXT1 into the
    internal form AOUTHDR_INT1, including the PE specific part kept in
    its `pe' member.

    Only the first NumberOfRvaAndSizes data-directory entries are taken
    from the file; the remaining entries, up to 16, are cleared.  A
    count above 16 is reported as an error and treated as 0.  Entry,
    text and data start addresses are converted from image relative to
    absolute by adding ImageBase; the entry address only when it is
    non-zero, the others only when the matching size is.  */
 
 void
 _bfd_XXi_swap_aouthdr_in (bfd * abfd,
 			  void * aouthdr_ext1,
 			  void * aouthdr_int1)
 {
   PEAOUTHDR * src = (PEAOUTHDR *) aouthdr_ext1;
   AOUTHDR * aouthdr_ext = (AOUTHDR *) aouthdr_ext1;
   struct internal_aouthdr *aouthdr_int
     = (struct internal_aouthdr *) aouthdr_int1;
   struct internal_extra_pe_aouthdr *a = &aouthdr_int->pe;
 
   aouthdr_int->magic = H_GET_16 (abfd, aouthdr_ext->magic);
   aouthdr_int->vstamp = H_GET_16 (abfd, aouthdr_ext->vstamp);
   aouthdr_int->tsize = GET_AOUTHDR_TSIZE (abfd, aouthdr_ext->tsize);
   aouthdr_int->dsize = GET_AOUTHDR_DSIZE (abfd, aouthdr_ext->dsize);
   aouthdr_int->bsize = GET_AOUTHDR_BSIZE (abfd, aouthdr_ext->bsize);
   aouthdr_int->entry = GET_AOUTHDR_ENTRY (abfd, aouthdr_ext->entry);
   aouthdr_int->text_start =
     GET_AOUTHDR_TEXT_START (abfd, aouthdr_ext->text_start);
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
   /* PE32+ does not have data_start member!  */
   aouthdr_int->data_start =
     GET_AOUTHDR_DATA_START (abfd, aouthdr_ext->data_start);
   a->BaseOfData = aouthdr_int->data_start;
 #endif
 
   a->Magic = aouthdr_int->magic;
   /* The two bytes of the vstamp field are the linker version.  */
   a->MajorLinkerVersion = H_GET_8 (abfd, aouthdr_ext->vstamp);
   a->MinorLinkerVersion = H_GET_8 (abfd, aouthdr_ext->vstamp + 1);
   a->SizeOfCode = aouthdr_int->tsize ;
   a->SizeOfInitializedData = aouthdr_int->dsize ;
   a->SizeOfUninitializedData = aouthdr_int->bsize ;
   a->AddressOfEntryPoint = aouthdr_int->entry;
   a->BaseOfCode = aouthdr_int->text_start;
   a->ImageBase = GET_OPTHDR_IMAGE_BASE (abfd, src->ImageBase);
   a->SectionAlignment = H_GET_32 (abfd, src->SectionAlignment);
   a->FileAlignment = H_GET_32 (abfd, src->FileAlignment);
   a->MajorOperatingSystemVersion =
     H_GET_16 (abfd, src->MajorOperatingSystemVersion);
   a->MinorOperatingSystemVersion =
     H_GET_16 (abfd, src->MinorOperatingSystemVersion);
   a->MajorImageVersion = H_GET_16 (abfd, src->MajorImageVersion);
   a->MinorImageVersion = H_GET_16 (abfd, src->MinorImageVersion);
   a->MajorSubsystemVersion = H_GET_16 (abfd, src->MajorSubsystemVersion);
   a->MinorSubsystemVersion = H_GET_16 (abfd, src->MinorSubsystemVersion);
   a->Reserved1 = H_GET_32 (abfd, src->Reserved1);
   a->SizeOfImage = H_GET_32 (abfd, src->SizeOfImage);
   a->SizeOfHeaders = H_GET_32 (abfd, src->SizeOfHeaders);
   a->CheckSum = H_GET_32 (abfd, src->CheckSum);
   a->Subsystem = H_GET_16 (abfd, src->Subsystem);
   a->DllCharacteristics = H_GET_16 (abfd, src->DllCharacteristics);
   a->SizeOfStackReserve =
     GET_OPTHDR_SIZE_OF_STACK_RESERVE (abfd, src->SizeOfStackReserve);
   a->SizeOfStackCommit =
     GET_OPTHDR_SIZE_OF_STACK_COMMIT (abfd, src->SizeOfStackCommit);
   a->SizeOfHeapReserve =
     GET_OPTHDR_SIZE_OF_HEAP_RESERVE (abfd, src->SizeOfHeapReserve);
   a->SizeOfHeapCommit =
     GET_OPTHDR_SIZE_OF_HEAP_COMMIT (abfd, src->SizeOfHeapCommit);
   a->LoaderFlags = H_GET_32 (abfd, src->LoaderFlags);
   a->NumberOfRvaAndSizes = H_GET_32 (abfd, src->NumberOfRvaAndSizes);
 
   {
     int idx;
     int dir_count;
 
+    /* PR 17512: Corrupt PE binaries can cause seg-faults.  */
+    if (a->NumberOfRvaAndSizes > 16)
+      {
+       (*_bfd_error_handler)
+	  (_("%B: aout header specifies an invalid number of data-directory entries: %d"),
+	   abfd, (int) a->NumberOfRvaAndSizes);
+	/* Paranoia: If the number is corrupt, then assume that the
+	   actual entries themselves might be corrupt as well.  */
+	a->NumberOfRvaAndSizes = 0;
+      }
+
     /* Read only as many entries as the header says it has.  Walking
        all 16 regardless would defeat the check above and would fetch
        entries the file never declared.  */
     dir_count = (int) a->NumberOfRvaAndSizes;
 
     for (idx = 0; idx < dir_count; idx++)
       {
         /* If data directory is empty, rva also should be 0.  */
 	int size =
 	  H_GET_32 (abfd, src->DataDirectory[idx][1]);
 
 	a->DataDirectory[idx].Size = size;
 
 	if (size)
 	  a->DataDirectory[idx].VirtualAddress =
 	    H_GET_32 (abfd, src->DataDirectory[idx][0]);
 	else
 	  a->DataDirectory[idx].VirtualAddress = 0;
       }
 
     /* Entries the header does not declare are empty.  */
     for (; idx < 16; idx++)
       {
 	a->DataDirectory[idx].Size = 0;
 	a->DataDirectory[idx].VirtualAddress = 0;
       }
   }
 
   if (aouthdr_int->entry)
     {
       aouthdr_int->entry += a->ImageBase;
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
       aouthdr_int->entry &= 0xffffffff;
 #endif
     }
 
   if (aouthdr_int->tsize)
     {
       aouthdr_int->text_start += a->ImageBase;
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
       aouthdr_int->text_start &= 0xffffffff;
 #endif
     }
 
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
   /* PE32+ does not have data_start member!  */
   if (aouthdr_int->dsize)
     {
       aouthdr_int->data_start += a->ImageBase;
       aouthdr_int->data_start &= 0xffffffff;
     }
 #endif
 
 #ifdef POWERPC_LE_PE
   /* These three fields are normally set up by ppc_relocate_section.
      In the case of reading a file in, we can pick them up from the
      DataDirectory.  */
   first_thunk_address = a->DataDirectory[PE_IMPORT_ADDRESS_TABLE].VirtualAddress;
   thunk_size = a->DataDirectory[PE_IMPORT_ADDRESS_TABLE].Size;
   import_table_size = a->DataDirectory[PE_IMPORT_TABLE].Size;
 #endif
 }
 
 /* Fill data-directory slot IDX of AOUT from the section called NAME.

    Nothing happens unless the section exists in ABFD and has both COFF
    and PE section data attached.  The slot's Size is set to the
    section's virt_size.  When that is non-zero, VirtualAddress becomes
    the section's vma relative to BASE, truncated to 32 bits, and the
    section is flagged SEC_DATA.  A zero size leaves VirtualAddress as
    it was.  */
 
 static void
 add_data_entry (bfd * abfd,
 		struct internal_extra_pe_aouthdr *aout,
 		int idx,
 		char *name,
 		bfd_vma base)
 {
   asection *sec = bfd_get_section_by_name (abfd, name);
   int size;
 
   /* Add import directory information if it exists.  */
   if (sec == NULL
       || coff_section_data (abfd, sec) == NULL
       || pei_section_data (abfd, sec) == NULL)
     return;
 
   size = pei_section_data (abfd, sec)->virt_size;
   aout->DataDirectory[idx].Size = size;
 
   /* If data directory is empty, rva also should be 0.  */
   if (size == 0)
     return;
 
   aout->DataDirectory[idx].VirtualAddress = (sec->vma - base) & 0xffffffff;
   sec->flags |= SEC_DATA;
 }
 
 /* Convert the internal optional (a.out) header IN into its external PE
    form OUT for ABFD and return AOUTSZ.
 
    This is more than a byte swap.  Before anything is written the
    function updates IN and ABFD's pe_opthdr in place:
      - text_start, data_start and entry in IN are made relative to the
        image base (and masked to 32 bits for non-PE32+ targets);
      - bsize is rounded up to the file alignment;
      - the data directory is cleared and rebuilt from the .edata, .rsrc,
        .pdata, .idata and .reloc sections plus the saved import, import
        address and TLS entries;
      - tsize, dsize, SizeOfHeaders and SizeOfImage are recomputed from
        ABFD's section list.
    The FA, SA and LINKER_VERSION macros defined here remain defined for
    the rest of the translation unit, as before.  */
 
 unsigned int
 _bfd_XXi_swap_aouthdr_out (bfd * abfd, void * in, void * out)
 {
   struct internal_aouthdr *aouthdr_in = (struct internal_aouthdr *) in;
   pe_data_type *pe = pe_data (abfd);
   struct internal_extra_pe_aouthdr *extra = &pe->pe_opthdr;
   PEAOUTHDR *aouthdr_out = (PEAOUTHDR *) out;
   bfd_vma sa, fa, ib;
   IMAGE_DATA_DIRECTORY idata2, idata5, tls;
 
   if (pe->force_minimum_alignment)
     {
       if (!extra->FileAlignment)
         extra->FileAlignment = PE_DEF_FILE_ALIGNMENT;
       if (!extra->SectionAlignment)
         extra->SectionAlignment = PE_DEF_SECTION_ALIGNMENT;
     }
 
   if (extra->Subsystem == IMAGE_SUBSYSTEM_UNKNOWN)
     extra->Subsystem = pe->target_subsystem;
 
   sa = extra->SectionAlignment;
   fa = extra->FileAlignment;
   ib = extra->ImageBase;
 
   /* Save these three entries now: the directory is zeroed below and
      they are copied back afterwards.  */
   idata2 = pe->pe_opthdr.DataDirectory[PE_IMPORT_TABLE];
   idata5 = pe->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE];
   tls = pe->pe_opthdr.DataDirectory[PE_TLS_TABLE];
 
   if (aouthdr_in->tsize)
     {
       aouthdr_in->text_start -= ib;
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
       aouthdr_in->text_start &= 0xffffffff;
 #endif
     }
 
   if (aouthdr_in->dsize)
     {
       aouthdr_in->data_start -= ib;
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
       aouthdr_in->data_start &= 0xffffffff;
 #endif
     }
 
   if (aouthdr_in->entry)
     {
       aouthdr_in->entry -= ib;
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
       aouthdr_in->entry &= 0xffffffff;
 #endif
     }
 
   /* Round X up to the file / section alignment.  Both rely on the
      alignment being a power of two.  */
 #define FA(x) (((x) + fa -1 ) & (- fa))
 #define SA(x) (((x) + sa -1 ) & (- sa))
 
   /* We like to have the sizes aligned.  */
   aouthdr_in->bsize = FA (aouthdr_in->bsize);
 
   extra->NumberOfRvaAndSizes = IMAGE_NUMBEROF_DIRECTORY_ENTRIES;
 
   /* First null out all data directory entries.  */
   memset (extra->DataDirectory, 0, sizeof (extra->DataDirectory));
 
   /* Slots 0, 2 and 3 are labelled export, resource and exception
      directory in dir_names.  */
   add_data_entry (abfd, extra, 0, ".edata", ib);
   add_data_entry (abfd, extra, 2, ".rsrc", ib);
   add_data_entry (abfd, extra, 3, ".pdata", ib);
 
   /* In theory we do not need to call add_data_entry for .idata$2 or
      .idata$5.  It will be done in bfd_coff_final_link where all the
      required information is available.  If however, we are not going
      to perform a final link, eg because we have been invoked by objcopy
      or strip, then we need to make sure that these Data Directory
      entries are initialised properly.
 
      So - we copy the input values into the output values, and then, if
      a final link is going to be performed, it can overwrite them.  */
   extra->DataDirectory[PE_IMPORT_TABLE]  = idata2;
   extra->DataDirectory[PE_IMPORT_ADDRESS_TABLE] = idata5;
   extra->DataDirectory[PE_TLS_TABLE] = tls;
 
   if (extra->DataDirectory[PE_IMPORT_TABLE].VirtualAddress == 0)
     /* Until other .idata fixes are made (pending patch), the entry for
        .idata is needed for backwards compatibility.  FIXME.  */
     add_data_entry (abfd, extra, PE_IMPORT_TABLE, ".idata", ib);
 
   /* For some reason, the virtual size (which is what's set by
      add_data_entry) for .reloc is not the same as the size recorded
      in this slot by MSVC; it doesn't seem to cause problems (so far),
      but since it's the best we've got, use it.  It does do the right
      thing for .pdata.  */
   if (pe->has_reloc_section)
     add_data_entry (abfd, extra, 5, ".reloc", ib);
 
   {
     asection *sec;
     bfd_vma hsize = 0;
     bfd_vma dsize = 0;
     bfd_vma isize = 0;
     bfd_vma tsize = 0;
 
     for (sec = abfd->sections; sec; sec = sec->next)
       {
         /* Keep the rounded size at full width: squeezing it into an
            int would turn a very large section into a negative number
            that is then sign-extended into the sums below.  */
         bfd_vma rounded = FA (sec->size);
 
         /* The first non-zero section filepos is the header size.
            Sections without contents will have a filepos of 0.  */
         if (hsize == 0)
           hsize = sec->filepos;
         if (sec->flags & SEC_DATA)
           dsize += rounded;
         if (sec->flags & SEC_CODE)
           tsize += rounded;
         /* The image size is the total VIRTUAL size (which is what is
            in the virt_size field).  Files have been seen (from MSVC
            5.0 link.exe) where the file size of the .data segment is
            quite small compared to the virtual size.  Without this
            fix, strip munges the file.
 
            FIXME: We need to handle holes between sections, which may
            happen when we convert from another format.  We just use
            the virtual address and virtual size of the last section
            for the image size.  */
         if (coff_section_data (abfd, sec) != NULL
             && pei_section_data (abfd, sec) != NULL)
           isize = (sec->vma - extra->ImageBase
                    + SA (FA (pei_section_data (abfd, sec)->virt_size)));
       }
 
     aouthdr_in->dsize = dsize;
     aouthdr_in->tsize = tsize;
     extra->SizeOfHeaders = hsize;
     extra->SizeOfImage = isize;
   }
 
   H_PUT_16 (abfd, aouthdr_in->magic, aouthdr_out->standard.magic);
 
 #define LINKER_VERSION 256 /* That is, 2.56 */
 
   /* This piece of magic sets the "linker version" field to
      LINKER_VERSION.  */
   H_PUT_16 (abfd, (LINKER_VERSION / 100 + (LINKER_VERSION % 100) * 256),
             aouthdr_out->standard.vstamp);
 
   PUT_AOUTHDR_TSIZE (abfd, aouthdr_in->tsize, aouthdr_out->standard.tsize);
   PUT_AOUTHDR_DSIZE (abfd, aouthdr_in->dsize, aouthdr_out->standard.dsize);
   PUT_AOUTHDR_BSIZE (abfd, aouthdr_in->bsize, aouthdr_out->standard.bsize);
   PUT_AOUTHDR_ENTRY (abfd, aouthdr_in->entry, aouthdr_out->standard.entry);
   PUT_AOUTHDR_TEXT_START (abfd, aouthdr_in->text_start,
                           aouthdr_out->standard.text_start);
 
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
   /* PE32+ does not have data_start member!  */
   PUT_AOUTHDR_DATA_START (abfd, aouthdr_in->data_start,
                           aouthdr_out->standard.data_start);
 #endif
 
   PUT_OPTHDR_IMAGE_BASE (abfd, extra->ImageBase, aouthdr_out->ImageBase);
   H_PUT_32 (abfd, extra->SectionAlignment, aouthdr_out->SectionAlignment);
   H_PUT_32 (abfd, extra->FileAlignment, aouthdr_out->FileAlignment);
   H_PUT_16 (abfd, extra->MajorOperatingSystemVersion,
             aouthdr_out->MajorOperatingSystemVersion);
   H_PUT_16 (abfd, extra->MinorOperatingSystemVersion,
             aouthdr_out->MinorOperatingSystemVersion);
   H_PUT_16 (abfd, extra->MajorImageVersion, aouthdr_out->MajorImageVersion);
   H_PUT_16 (abfd, extra->MinorImageVersion, aouthdr_out->MinorImageVersion);
   H_PUT_16 (abfd, extra->MajorSubsystemVersion,
             aouthdr_out->MajorSubsystemVersion);
   H_PUT_16 (abfd, extra->MinorSubsystemVersion,
             aouthdr_out->MinorSubsystemVersion);
   H_PUT_32 (abfd, extra->Reserved1, aouthdr_out->Reserved1);
   H_PUT_32 (abfd, extra->SizeOfImage, aouthdr_out->SizeOfImage);
   H_PUT_32 (abfd, extra->SizeOfHeaders, aouthdr_out->SizeOfHeaders);
   H_PUT_32 (abfd, extra->CheckSum, aouthdr_out->CheckSum);
   H_PUT_16 (abfd, extra->Subsystem, aouthdr_out->Subsystem);
   H_PUT_16 (abfd, extra->DllCharacteristics, aouthdr_out->DllCharacteristics);
   PUT_OPTHDR_SIZE_OF_STACK_RESERVE (abfd, extra->SizeOfStackReserve,
                                     aouthdr_out->SizeOfStackReserve);
   PUT_OPTHDR_SIZE_OF_STACK_COMMIT (abfd, extra->SizeOfStackCommit,
                                    aouthdr_out->SizeOfStackCommit);
   PUT_OPTHDR_SIZE_OF_HEAP_RESERVE (abfd, extra->SizeOfHeapReserve,
                                    aouthdr_out->SizeOfHeapReserve);
   PUT_OPTHDR_SIZE_OF_HEAP_COMMIT (abfd, extra->SizeOfHeapCommit,
                                   aouthdr_out->SizeOfHeapCommit);
   H_PUT_32 (abfd, extra->LoaderFlags, aouthdr_out->LoaderFlags);
   H_PUT_32 (abfd, extra->NumberOfRvaAndSizes,
             aouthdr_out->NumberOfRvaAndSizes);
   {
     int idx;
 
     /* Write exactly as many entries as NumberOfRvaAndSizes announces
        above, rather than a separately hard-coded count.  */
     for (idx = 0; idx < IMAGE_NUMBEROF_DIRECTORY_ENTRIES; idx++)
       {
         H_PUT_32 (abfd, extra->DataDirectory[idx].VirtualAddress,
                   aouthdr_out->DataDirectory[idx][0]);
         H_PUT_32 (abfd, extra->DataDirectory[idx].Size,
                   aouthdr_out->DataDirectory[idx][1]);
       }
   }
 
   return AOUTSZ;
 }
 
 /* Write the external PE image file header OUT for ABFD from the
    internal file header IN and return FILHSZ.
 
    IN is modified first: F_RELFLG is cleared when the image has a reloc
    section, F_DLL is set for DLLs, and the embedded DOS header, DOS stub
    and NT signature are filled with fixed values.  The time stamp that
    is written is the current time, not IN's f_timdat.  */
 
 unsigned int
 _bfd_XXi_only_swap_filehdr_out (bfd * abfd, void * in, void * out)
 {
   /* The canned DOS stub, one 32-bit word per entry.  Read as
      little-endian bytes, words 3 to 13 hold the text
      "This program cannot be run in DOS mode." followed by CR CR LF,
      and word 14 holds a '$'.  */
   static const unsigned long dos_stub_words[16] =
     {
       0x0eba1f0e, 0xcd09b400, 0x4c01b821, 0x685421cd,
       0x70207369, 0x72676f72, 0x63206d61, 0x6f6e6e61,
       0x65622074, 0x6e757220, 0x206e6920, 0x20534f44,
       0x65646f6d, 0x0a0d0d2e, 0x24, 0x0
     };
   struct internal_filehdr *hdr = (struct internal_filehdr *) in;
   struct external_PEI_filehdr *ext = (struct external_PEI_filehdr *) out;
   int k;
 
   /* Adjust the COFF flags for this kind of image.  */
   if (pe_data (abfd)->has_reloc_section)
     hdr->f_flags &= ~F_RELFLG;
 
   if (pe_data (abfd)->dll)
     hdr->f_flags |= F_DLL;
 
   /* Fill in the constant DOS header.  */
   hdr->pe.e_magic = DOSMAGIC;
   hdr->pe.e_cblp = 0x90;
   hdr->pe.e_cp = 0x3;
   hdr->pe.e_crlc = 0x0;
   hdr->pe.e_cparhdr = 0x4;
   hdr->pe.e_minalloc = 0x0;
   hdr->pe.e_maxalloc = 0xffff;
   hdr->pe.e_ss = 0x0;
   hdr->pe.e_sp = 0xb8;
   hdr->pe.e_csum = 0x0;
   hdr->pe.e_ip = 0x0;
   hdr->pe.e_cs = 0x0;
   hdr->pe.e_lfarlc = 0x40;
   hdr->pe.e_ovno = 0x0;
 
   for (k = 0; k < 4; k++)
     hdr->pe.e_res[k] = 0x0;
 
   hdr->pe.e_oemid = 0x0;
   hdr->pe.e_oeminfo = 0x0;
 
   for (k = 0; k < 10; k++)
     hdr->pe.e_res2[k] = 0x0;
 
   hdr->pe.e_lfanew = 0x80;
 
   /* This data is mostly just characters and appears to be constant
      within the headers put on NT exes.  */
   for (k = 0; k < 16; k++)
     hdr->pe.dos_message[k] = dos_stub_words[k];
 
   hdr->pe.nt_signature = NT_SIGNATURE;
 
   /* The COFF file header proper.  */
   H_PUT_16 (abfd, hdr->f_magic, ext->f_magic);
   H_PUT_16 (abfd, hdr->f_nscns, ext->f_nscns);
 
   H_PUT_32 (abfd, time (0), ext->f_timdat);
   PUT_FILEHDR_SYMPTR (abfd, hdr->f_symptr, ext->f_symptr);
   H_PUT_32 (abfd, hdr->f_nsyms, ext->f_nsyms);
   H_PUT_16 (abfd, hdr->f_opthdr, ext->f_opthdr);
   H_PUT_16 (abfd, hdr->f_flags, ext->f_flags);
 
   /* The DOS header that has to be tacked on to the beginning of all
      exes for NT.  */
   H_PUT_16 (abfd, hdr->pe.e_magic, ext->e_magic);
   H_PUT_16 (abfd, hdr->pe.e_cblp, ext->e_cblp);
   H_PUT_16 (abfd, hdr->pe.e_cp, ext->e_cp);
   H_PUT_16 (abfd, hdr->pe.e_crlc, ext->e_crlc);
   H_PUT_16 (abfd, hdr->pe.e_cparhdr, ext->e_cparhdr);
   H_PUT_16 (abfd, hdr->pe.e_minalloc, ext->e_minalloc);
   H_PUT_16 (abfd, hdr->pe.e_maxalloc, ext->e_maxalloc);
   H_PUT_16 (abfd, hdr->pe.e_ss, ext->e_ss);
   H_PUT_16 (abfd, hdr->pe.e_sp, ext->e_sp);
   H_PUT_16 (abfd, hdr->pe.e_csum, ext->e_csum);
   H_PUT_16 (abfd, hdr->pe.e_ip, ext->e_ip);
   H_PUT_16 (abfd, hdr->pe.e_cs, ext->e_cs);
   H_PUT_16 (abfd, hdr->pe.e_lfarlc, ext->e_lfarlc);
   H_PUT_16 (abfd, hdr->pe.e_ovno, ext->e_ovno);
 
   for (k = 0; k < 4; k++)
     H_PUT_16 (abfd, hdr->pe.e_res[k], ext->e_res[k]);
 
   H_PUT_16 (abfd, hdr->pe.e_oemid, ext->e_oemid);
   H_PUT_16 (abfd, hdr->pe.e_oeminfo, ext->e_oeminfo);
 
   for (k = 0; k < 10; k++)
     H_PUT_16 (abfd, hdr->pe.e_res2[k], ext->e_res2[k]);
 
   H_PUT_32 (abfd, hdr->pe.e_lfanew, ext->e_lfanew);
 
   for (k = 0; k < 16; k++)
     H_PUT_32 (abfd, hdr->pe.dos_message[k], ext->dos_message[k]);
 
   /* Finally the NT signature.  */
   H_PUT_32 (abfd, hdr->pe.nt_signature, ext->nt_signature);
 
   return FILHSZ;
 }
 
 /* Write the external COFF file header OUT for ABFD from the internal
    file header IN, field for field, and return FILHSZ.  Unlike the
    image variant, no DOS header is emitted and IN is not modified.  */
 
 unsigned int
 _bfd_XX_only_swap_filehdr_out (bfd * abfd, void * in, void * out)
 {
   struct internal_filehdr *hdr;
   FILHDR *ext;
 
   hdr = (struct internal_filehdr *) in;
   ext = (FILHDR *) out;
 
   /* Identification and section count.  */
   H_PUT_16 (abfd, hdr->f_magic, ext->f_magic);
   H_PUT_16 (abfd, hdr->f_nscns, ext->f_nscns);
 
   /* Time stamp and symbol table location.  */
   H_PUT_32 (abfd, hdr->f_timdat, ext->f_timdat);
   PUT_FILEHDR_SYMPTR (abfd, hdr->f_symptr, ext->f_symptr);
   H_PUT_32 (abfd, hdr->f_nsyms, ext->f_nsyms);
 
   /* Optional header size and flags.  */
   H_PUT_16 (abfd, hdr->f_opthdr, ext->f_opthdr);
   H_PUT_16 (abfd, hdr->f_flags, ext->f_flags);
 
   return FILHSZ;
 }
 
 /* Write the external PE section header OUT for ABFD from the internal
    section header IN.  Returns SCNHSZ on success, or 0 (with the bfd
    error set to bfd_error_file_truncated) when the line number count
    does not fit in the 16-bit external field.
 
    IN's s_flags may be modified: well-known section names get their
    required characteristics forced on, and IMAGE_SCN_LNK_NRELOC_OVFL is
    set when the relocation count has to be stored as 0xffff.  */
 
 unsigned int
 _bfd_XXi_swap_scnhdr_out (bfd * abfd, void * in, void * out)
 {
   /* Characteristics that a section of a given name must carry.  Every
      listed section is readable; .text is additionally code and
      executable; .bss, .data, .idata, .rsrc and .tls are writable
      (important for .idata, whose routine addresses are overwritten
      when .dlls are bound); .arch and .reloc are discardable.
 
      FIXME: Alignment is also encoded in this field, at least on PPC
      and ARM-WINCE.  Although - how do we get the original alignment
      field back ?  */
   typedef struct
   {
     const char *section_name;
     unsigned long must_have;
   }
   pe_required_section_flags;
 
   static const pe_required_section_flags known_sections [] =
     {
       { ".arch",  IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_DISCARDABLE | IMAGE_SCN_ALIGN_8BYTES },
       { ".bss",   IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_UNINITIALIZED_DATA | IMAGE_SCN_MEM_WRITE },
       { ".data",  IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE },
       { ".edata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA },
       { ".idata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE },
       { ".pdata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA },
       { ".rdata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA },
       { ".reloc", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_DISCARDABLE },
       { ".rsrc",  IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE },
       { ".text" , IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_CODE | IMAGE_SCN_MEM_EXECUTE },
       { ".tls",   IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_WRITE },
       { ".xdata", IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA },
       { NULL, 0}
     };
 
   struct internal_scnhdr *hdr = (struct internal_scnhdr *) in;
   SCNHDR *ext = (SCNHDR *) out;
   const pe_required_section_flags *entry;
   unsigned int result = SCNHSZ;
   bfd_vma virt_size;
   bfd_vma raw_size;
   int is_image;
   int text_of_final_exe;
 
   memcpy (ext->s_name, hdr->s_name, sizeof (hdr->s_name));
 
   /* The address is stored relative to the image base.  */
   PUT_SCNHDR_VADDR (abfd,
                     ((hdr->s_vaddr - pe_data (abfd)->pe_opthdr.ImageBase)
                      & 0xffffffff),
                     ext->s_vaddr);
 
   /* NT wants the size data to be rounded up to the next
      NT_FILE_ALIGNMENT, but zero if it has no content (as in .bss,
      sometimes).  Object files never record a virtual size; images
      record one, and for uninitialised data it replaces the raw size.  */
   is_image = bfd_pe_executable_p (abfd) ? 1 : 0;
 
   if (! is_image)
     {
       virt_size = 0;
       raw_size = hdr->s_size;
     }
   else if ((hdr->s_flags & IMAGE_SCN_CNT_UNINITIALIZED_DATA) != 0)
     {
       virt_size = hdr->s_size;
       raw_size = 0;
     }
   else
     {
       virt_size = hdr->s_paddr;
       raw_size = hdr->s_size;
     }
 
   PUT_SCNHDR_SIZE (abfd, raw_size, ext->s_size);
 
   /* s_paddr in PE is really the virtual size.  */
   PUT_SCNHDR_PADDR (abfd, virt_size, ext->s_paddr);
 
   PUT_SCNHDR_SCNPTR (abfd, hdr->s_scnptr, ext->s_scnptr);
   PUT_SCNHDR_RELPTR (abfd, hdr->s_relptr, ext->s_relptr);
   PUT_SCNHDR_LNNOPTR (abfd, hdr->s_lnnoptr, ext->s_lnnoptr);
 
   /* Look the section up among the well-known names.  */
   for (entry = known_sections; entry->section_name != NULL; entry++)
     if (strcmp (hdr->s_name, entry->section_name) == 0)
       break;
 
   if (entry->section_name != NULL)
     {
       /* We have defaulted to adding the IMAGE_SCN_MEM_WRITE flag, but
          now we know exactly what this specific section wants so we
          remove it and let must_have add it back in if necessary.
          The exception is .text when the default WP_TEXT file flag has
          been cleared, which may happen through ld --enable-auto-import
          (if auto-import is actually needed), ld --omagic, or
          objcopy --writable-text.  */
       if (strcmp (hdr->s_name, ".text") != 0
           || (bfd_get_file_flags (abfd) & WP_TEXT) != 0)
         hdr->s_flags &= ~IMAGE_SCN_MEM_WRITE;
 
       hdr->s_flags |= entry->must_have;
     }
 
   H_PUT_32 (abfd, hdr->s_flags, ext->s_flags);
 
   /* Is this the .text section of a final, non-shared link?  */
   text_of_final_exe = 0;
   if (coff_data (abfd)->link_info
       && strcmp (hdr->s_name, ".text") == 0)
     text_of_final_exe = (! coff_data (abfd)->link_info->relocatable
                          && ! coff_data (abfd)->link_info->shared);
 
   if (text_of_final_exe)
     {
       /* By inference from looking at MS output, the 32 bit field
          which is the combination of the number_of_relocs and
          number_of_linenos is used for the line number count in
          executables.  A 16-bit field won't do for cc1.  The MS
          document says that the number of relocs is zero for
          executables, but the 17-th bit has been observed to be there.
          Overflow is not an issue: a 4G-line program will overflow a
          bunch of other fields long before this!  */
       H_PUT_16 (abfd, (hdr->s_nlnno & 0xffff), ext->s_nlnno);
       H_PUT_16 (abfd, (hdr->s_nlnno >> 16), ext->s_nreloc);
       return result;
     }
 
   /* Everywhere else the line number count has only 16 bits.  */
   if (hdr->s_nlnno > 0xffff)
     {
       (*_bfd_error_handler) (_("%s: line number overflow: 0x%lx > 0xffff"),
                              bfd_get_filename (abfd),
                              hdr->s_nlnno);
       bfd_set_error (bfd_error_file_truncated);
       H_PUT_16 (abfd, 0xffff, ext->s_nlnno);
       result = 0;
     }
   else
     H_PUT_16 (abfd, hdr->s_nlnno, ext->s_nlnno);
 
   /* Although we could encode 0xffff relocs here, we do not, to be
      consistent with other parts of bfd.  Also it lets us warn, as
      we should never see 0xffff here w/o having the overflow flag
      set.  */
   if (hdr->s_nreloc >= 0xffff)
     {
       /* PE can deal with large #s of relocs, but not here.  The flags
          word was already written above, so write it again with the
          overflow bit added.  */
       H_PUT_16 (abfd, 0xffff, ext->s_nreloc);
       hdr->s_flags |= IMAGE_SCN_LNK_NRELOC_OVFL;
       H_PUT_32 (abfd, hdr->s_flags, ext->s_flags);
     }
   else
     H_PUT_16 (abfd, hdr->s_nreloc, ext->s_nreloc);
 
   return result;
 }
 
 /* Descriptive labels for the optional header's data directory slots,
    one per slot and in slot order, so that dir_names[N] describes
    DataDirectory[N].  Where a slot is normally backed by a particular
    section, the label names it in brackets.
    NOTE(review): the strings are only marked with N_ () here; presumably
    they are translated where they are printed -- confirm at the use.  */
 static char * dir_names[IMAGE_NUMBEROF_DIRECTORY_ENTRIES] =
 {
   N_("Export Directory [.edata (or where ever we found it)]"),
   N_("Import Directory [parts of .idata]"),
   N_("Resource Directory [.rsrc]"),
   N_("Exception Directory [.pdata]"),
   N_("Security Directory"),
   N_("Base Relocation Directory [.reloc]"),
   N_("Debug Directory"),
   N_("Description Directory"),
   N_("Special Directory"),
   N_("Thread Storage Directory [.tls]"),
   N_("Load Configuration Directory"),
   N_("Bound Import Directory"),
   N_("Import Address Table Directory"),
   N_("Delay Import Directory"),
   N_("CLR Runtime Header"),
   N_("Reserved")
 };
 
 #ifdef POWERPC_LE_PE
 /* The code for the PPC really falls in the "architecture dependent"
    category.  However, it's not clear that anyone will ever care, so
    we're ignoring the issue for now; if/when PPC matters, some of this
    may need to go into peicode.h, or arguments passed to enable the
    PPC- specific code.  */
 #endif
 
 /* Print the import tables of ABFD to VFILE, which is really a FILE *.
 
    The import table is located through the PE_IMPORT_TABLE data
    directory entry or, when that entry is entirely zero, through a
    section named .idata.  Each 20-byte import descriptor is printed,
    followed by the DLL name and the hint/name entries it refers to.
 
    Returns TRUE when the table was printed or when there is nothing to
    print (no table, empty section, or the containing section cannot be
    found), and FALSE when section contents could not be read.
 
    The descriptor fields all come from the file and may be corrupt.
    Descriptors are only read while a whole one fits in the section, and
    the DLL name is never printed past the end of the section buffer.
    NOTE(review): the hint/name vector offsets and the member name
    offsets are still used without a bounds check.  */
 
 static bfd_boolean
 pe_print_idata (bfd * abfd, void * vfile)
 {
   FILE *file = (FILE *) vfile;
   bfd_byte *data = NULL;
   asection *section;
   bfd_signed_vma adj;
 
 #ifdef POWERPC_LE_PE
   asection *rel_section = bfd_get_section_by_name (abfd, ".reldata");
 #endif
 
   bfd_size_type datasize = 0;
   bfd_size_type dataoff;
   bfd_size_type i;
   int onaline = 20;	/* Size of one import descriptor, in bytes.  */
 
   pe_data_type *pe = pe_data (abfd);
   struct internal_extra_pe_aouthdr *extra = &pe->pe_opthdr;
 
   bfd_vma addr;
 
   addr = extra->DataDirectory[PE_IMPORT_TABLE].VirtualAddress;
 
   if (addr == 0 && extra->DataDirectory[PE_IMPORT_TABLE].Size == 0)
     {
       /* Maybe the extra header isn't there.  Look for the section.  */
       section = bfd_get_section_by_name (abfd, ".idata");
       if (section == NULL)
         return TRUE;
 
       addr = section->vma;
       datasize = section->size;
       if (datasize == 0)
         return TRUE;
     }
   else
     {
       /* The directory holds an rva; find the section that covers it.  */
       addr += extra->ImageBase;
       for (section = abfd->sections; section != NULL; section = section->next)
         {
           datasize = section->size;
           if (addr >= section->vma && addr < section->vma + datasize)
             break;
         }
 
       if (section == NULL)
         {
           fprintf (file,
                    _("\nThere is an import table, but the section containing it could not be found\n"));
           return TRUE;
         }
     }
 
   fprintf (file, _("\nThere is an import table in %s at 0x%lx\n"),
            section->name, (unsigned long) addr);
 
   /* From here on DATASIZE is the number of bytes between the start of
      the import table and the end of its section.  */
   dataoff = addr - section->vma;
   datasize -= dataoff;
 
 #ifdef POWERPC_LE_PE
   if (rel_section != 0 && rel_section->size != 0)
     {
       /* The toc address can be found by taking the starting address,
          which on the PPC locates a function descriptor. The
          descriptor consists of the function code starting address
          followed by the address of the toc. The starting address we
          get from the bfd, and the descriptor is supposed to be in the
          .reldata section.  */
 
       bfd_vma loadable_toc_address;
       bfd_vma toc_address;
       bfd_vma start_address;
       bfd_byte *rel_data = NULL;
       bfd_vma offset;
 
       if (!bfd_malloc_and_get_section (abfd, rel_section, &rel_data))
         {
           if (rel_data != NULL)
             free (rel_data);
           return FALSE;
         }
 
       offset = abfd->start_address - rel_section->vma;
 
       /* Both words of the descriptor must lie inside .reldata.  */
       if (offset >= rel_section->size || offset + 8 > rel_section->size)
         {
           if (rel_data != NULL)
             free (rel_data);
           return FALSE;
         }
 
       start_address = bfd_get_32 (abfd, rel_data + offset);
       loadable_toc_address = bfd_get_32 (abfd, rel_data + offset + 4);
       toc_address = loadable_toc_address - 32768;
 
       fprintf (file,
                _("\nFunction descriptor located at the start address: %04lx\n"),
                (unsigned long int) (abfd->start_address));
       /* bfd_vma need not be unsigned long, so cast to match %lx.  */
       fprintf (file,
                _("\tcode-base %08lx toc (loadable/actual) %08lx/%08lx\n"),
                (unsigned long) start_address,
                (unsigned long) loadable_toc_address,
                (unsigned long) toc_address);
       if (rel_data != NULL)
         free (rel_data);
     }
   else
     {
       fprintf (file,
                _("\nNo reldata section! Function descriptor not decoded.\n"));
     }
 #endif
 
   fprintf (file,
            _("\nThe Import Tables (interpreted %s section contents)\n"),
            section->name);
   fprintf (file,
            _("\
  vma:            Hint    Time      Forward  DLL       First\n\
                  Table   Stamp     Chain    Name      Thunk\n"));
 
   /* Read the whole section.  Some of the fields might be before dataoff.  */
   if (!bfd_malloc_and_get_section (abfd, section, &data))
     {
       if (data != NULL)
         free (data);
       return FALSE;
     }
 
   /* ADJ converts an rva into an offset from the start of DATA.  */
   adj = section->vma - extra->ImageBase;
 
   /* Print all image import descriptors.  Stop as soon as a complete
      descriptor no longer fits, so that a truncated or corrupt table
      cannot make us read beyond the section contents.  */
   for (i = 0; i + onaline <= datasize; i += onaline)
     {
       bfd_vma hint_addr;
       bfd_vma time_stamp;
       bfd_vma forward_chain;
       bfd_vma dll_name;
       bfd_vma first_thunk;
       bfd_vma dll_off;
       bfd_size_type dll_room;
       int dll_len;
       int idx = 0;
       bfd_size_type j;
       char *dll;
 
       /* Print (i + extra->DataDirectory[PE_IMPORT_TABLE].VirtualAddress).  */
       fprintf (file, " %08lx\t", (unsigned long) (i + adj + dataoff));
       hint_addr = bfd_get_32 (abfd, data + i + dataoff);
       time_stamp = bfd_get_32 (abfd, data + i + 4 + dataoff);
       forward_chain = bfd_get_32 (abfd, data + i + 8 + dataoff);
       dll_name = bfd_get_32 (abfd, data + i + 12 + dataoff);
       first_thunk = bfd_get_32 (abfd, data + i + 16 + dataoff);
 
       fprintf (file, "%08lx %08lx %08lx %08lx %08lx\n",
                (unsigned long) hint_addr,
                (unsigned long) time_stamp,
                (unsigned long) forward_chain,
                (unsigned long) dll_name,
                (unsigned long) first_thunk);
 
       /* A descriptor with neither table terminates the list.  */
       if (hint_addr == 0 && first_thunk == 0)
         break;
 
       dll_off = dll_name - adj;
       if (dll_off >= section->size)
         break;
 
       /* The name comes from the file and need not be NUL terminated,
          so never print more than what is left of the section.
          NOTE(review): this assumes DATA holds section->size bytes, as
          the range check above already does.  */
       dll = (char *) data + dll_off;
       dll_room = section->size - dll_off;
       dll_len = dll_room > 0x7fffffff ? 0x7fffffff : (int) dll_room;
       fprintf (file, _("\n\tDLL Name: %.*s\n"), dll_len, dll);
 
       if (hint_addr != 0)
         {
           bfd_byte *ft_data;
           asection *ft_section;
           bfd_vma ft_addr;
           bfd_size_type ft_datasize;
           int ft_idx;
           int ft_allocated = 0;
 
           fprintf (file, _("\tvma:  Hint/Ord Member-Name Bound-To\n"));
 
           /* NOTE(review): IDX and the member offsets used below are
              taken from the file without validation.  */
           idx = hint_addr - adj;
 
           ft_addr = first_thunk + extra->ImageBase;
           ft_data = data;
           ft_idx = first_thunk - adj;
 
           if (first_thunk != hint_addr)
             {
               /* Find the section which contains the first thunk.  */
               for (ft_section = abfd->sections;
                    ft_section != NULL;
                    ft_section = ft_section->next)
                 {
                   ft_datasize = ft_section->size;
                   if (ft_addr >= ft_section->vma
                       && ft_addr < ft_section->vma + ft_datasize)
                     break;
                 }
 
               if (ft_section == NULL)
                 {
                   fprintf (file,
                        _("\nThere is a first thunk, but the section containing it could not be found\n"));
                   continue;
                 }
 
               /* Now check to see if this section is the same as our current
                  section.  If it is not then we will have to load its data in.  */
               if (ft_section == section)
                 {
                   ft_data = data;
                   ft_idx = first_thunk - adj;
                 }
               else
                 {
                   ft_idx = first_thunk - (ft_section->vma - extra->ImageBase);
                   ft_data = bfd_malloc (datasize);
                   if (ft_data == NULL)
                     continue;
 
                   /* Read datasize bfd_bytes starting at offset ft_idx.  */
                   if (! bfd_get_section_contents
                       (abfd, ft_section, ft_data, (bfd_vma) ft_idx, datasize))
                     {
                       free (ft_data);
                       continue;
                     }
 
                   /* The private copy starts at the first thunk.  */
                   ft_idx = 0;
                   ft_allocated = 1;
                 }
             }
 
           /* Print HintName vector entries.  */
 #ifdef COFF_WITH_pex64
           for (j = 0; j < datasize; j += 8)
             {
               unsigned long member = bfd_get_32 (abfd, data + idx + j);
               unsigned long member_high = bfd_get_32 (abfd, data + idx + j + 4);
 
               /* An all-zero entry ends the vector.  */
               if (!member && !member_high)
                 break;
 
               /* The top bit marks an import by ordinal, without a name.  */
               if (member_high & 0x80000000)
                 fprintf (file, "\t%lx%08lx\t %4lx%08lx  <none>",
                          member_high,member, member_high & 0x7fffffff, member);
               else
                 {
                   int ordinal;
                   char *member_name;
 
                   ordinal = bfd_get_16 (abfd, data + member - adj);
                   member_name = (char *) data + member - adj + 2;
                   fprintf (file, "\t%04lx\t %4d  %s",member, ordinal, member_name);
                 }
 
               /* If the time stamp is not zero, the import address
                  table holds actual addresses.  */
               if (time_stamp != 0
                   && first_thunk != 0
                   && first_thunk != hint_addr)
                 fprintf (file, "\t%04lx",
                          (long) bfd_get_32 (abfd, ft_data + ft_idx + j));
               fprintf (file, "\n");
             }
 #else
           for (j = 0; j < datasize; j += 4)
             {
               unsigned long member = bfd_get_32 (abfd, data + idx + j);
 
               /* Print single IMAGE_IMPORT_BY_NAME vector.  A zero
                  entry ends the vector.  */
               if (member == 0)
                 break;
 
               /* The top bit marks an import by ordinal, without a name.  */
               if (member & 0x80000000)
                 fprintf (file, "\t%04lx\t %4lu  <none>",
                          member, member & 0x7fffffff);
               else
                 {
                   int ordinal;
                   char *member_name;
 
                   ordinal = bfd_get_16 (abfd, data + member - adj);
                   member_name = (char *) data + member - adj + 2;
                   fprintf (file, "\t%04lx\t %4d  %s",
                            member, ordinal, member_name);
                 }
 
               /* If the time stamp is not zero, the import address
                  table holds actual addresses.  */
               if (time_stamp != 0
                   && first_thunk != 0
                   && first_thunk != hint_addr)
                 fprintf (file, "\t%04lx",
                          (long) bfd_get_32 (abfd, ft_data + ft_idx + j));
 
               fprintf (file, "\n");
             }
 #endif
           if (ft_allocated)
             free (ft_data);
         }
 
       fprintf (file, "\n");
     }
 
   free (data);
 
   return TRUE;
 }
 
 static bfd_boolean
 pe_print_edata (bfd * abfd, void * vfile)
 {
   FILE *file = (FILE *) vfile;
   bfd_byte *data;
   asection *section;
   bfd_size_type datasize = 0;
   bfd_size_type dataoff;
   bfd_size_type i;
   bfd_signed_vma adj;
   struct EDT_type
   {
     long export_flags;          /* Reserved - should be zero.  */
     long time_stamp;
     short major_ver;
     short minor_ver;
     bfd_vma name;               /* RVA - relative to image base.  */
     long base;                  /* Ordinal base.  */
     unsigned long num_functions;/* Number in the export address table.  */
     unsigned long num_names;    /* Number in the name pointer table.  */
     bfd_vma eat_addr;		/* RVA to the export address table.  */
     bfd_vma npt_addr;		/* RVA to the Export Name Pointer Table.  */
     bfd_vma ot_addr;		/* RVA to the Ordinal Table.  */
   } edt;
 
   pe_data_type *pe = pe_data (abfd);
   struct internal_extra_pe_aouthdr *extra = &pe->pe_opthdr;
 
   bfd_vma addr;
 
   addr = extra->DataDirectory[PE_EXPORT_TABLE].VirtualAddress;
 
   if (addr == 0 && extra->DataDirectory[PE_EXPORT_TABLE].Size == 0)
     {
       /* Maybe the extra header isn't there.  Look for the section.  */
       section = bfd_get_section_by_name (abfd, ".edata");
       if (section == NULL)
 	return TRUE;
 
       addr = section->vma;
       dataoff = 0;
       datasize = section->size;
       if (datasize == 0)
 	return TRUE;
     }
   else
     {
       addr += extra->ImageBase;
 
       for (section = abfd->sections; section != NULL; section = section->next)
 	if (addr >= section->vma && addr < section->vma + section->size)
 	  break;
 
       if (section == NULL)
 	{
 	  fprintf (file,
 		   _("\nThere is an export table, but the section containing it could not be found\n"));
 	  return TRUE;
 	}
 
       dataoff = addr - section->vma;
       datasize = extra->DataDirectory[PE_EXPORT_TABLE].Size;
       if (datasize > section->size - dataoff)
 	{
 	  fprintf (file,
 		   _("\nThere is an export table in %s, but it does not fit into that section\n"),
 		   section->name);
 	  return TRUE;
 	}
+    }
+
+  /* PR 17512: Handle corrupt PE binaries.  */
+  if (datasize < 36)
+    {
+      fprintf (file,
+	       _("\nThere is an export table in %s, but it is too small (%d)\n"),
+	       section->name, (int) datasize);
+      return TRUE;
     }
 
   fprintf (file, _("\nThere is an export table in %s at 0x%lx\n"),
 	   section->name, (unsigned long) addr);
 
   data = bfd_malloc (datasize);
   if (data == NULL)
     return FALSE;
 
   if (! bfd_get_section_contents (abfd, section, data,
 				  (file_ptr) dataoff, datasize))
     return FALSE;
 
   /* Go get Export Directory Table.  */
   edt.export_flags   = bfd_get_32 (abfd, data +  0);
   edt.time_stamp     = bfd_get_32 (abfd, data +  4);
   edt.major_ver      = bfd_get_16 (abfd, data +  8);
   edt.minor_ver      = bfd_get_16 (abfd, data + 10);
   edt.name           = bfd_get_32 (abfd, data + 12);
   edt.base           = bfd_get_32 (abfd, data + 16);
   edt.num_functions  = bfd_get_32 (abfd, data + 20);
   edt.num_names      = bfd_get_32 (abfd, data + 24);
   edt.eat_addr       = bfd_get_32 (abfd, data + 28);
   edt.npt_addr       = bfd_get_32 (abfd, data + 32);
   edt.ot_addr        = bfd_get_32 (abfd, data + 36);
 
   adj = section->vma - extra->ImageBase + dataoff;
 
   /* Dump the EDT first.  */
   fprintf (file,
 	   _("\nThe Export Tables (interpreted %s section contents)\n\n"),
 	   section->name);
 
   fprintf (file,
 	   _("Export Flags \t\t\t%lx\n"), (unsigned long) edt.export_flags);
 
   fprintf (file,
 	   _("Time/Date stamp \t\t%lx\n"), (unsigned long) edt.time_stamp);
 
   fprintf (file,
 	   _("Major/Minor \t\t\t%d/%d\n"), edt.major_ver, edt.minor_ver);
 
   fprintf (file,
 	   _("Name \t\t\t\t"));
   fprintf_vma (file, edt.name);
   fprintf (file,
 	   " %s\n", data + edt.name - adj);
 
   fprintf (file,
 	   _("Ordinal Base \t\t\t%ld\n"), edt.base);
 
   fprintf (file,
 	   _("Number in:\n"));
 
   fprintf (file,
 	   _("\tExport Address Table \t\t%08lx\n"),
 	   edt.num_functions);
 
   fprintf (file,
 	   _("\t[Name Pointer/Ordinal] Table\t%08lx\n"), edt.num_names);
 
   fprintf (file,
 	   _("Table Addresses\n"));
 
   fprintf (file,
 	   _("\tExport Address Table \t\t"));
   fprintf_vma (file, edt.eat_addr);
   fprintf (file, "\n");
 
   fprintf (file,
 	   _("\tName Pointer Table \t\t"));
   fprintf_vma (file, edt.npt_addr);
   fprintf (file, "\n");
 
   fprintf (file,
 	   _("\tOrdinal Table \t\t\t"));
   fprintf_vma (file, edt.ot_addr);
   fprintf (file, "\n");
 
   /* The next table to find is the Export Address Table. It's basically
      a list of pointers that either locate a function in this dll, or
      forward the call to another dll. Something like:
       typedef union
       {
         long export_rva;
         long forwarder_rva;
       } export_address_table_entry;  */
 
   fprintf (file,
 	  _("\nExport Address Table -- Ordinal Base %ld\n"),
 	  edt.base);
 
   for (i = 0; i < edt.num_functions; ++i)
     {
       bfd_vma eat_member = bfd_get_32 (abfd,
 				       data + edt.eat_addr + (i * 4) - adj);
       if (eat_member == 0)
 	continue;
 
       if (eat_member - adj <= datasize)
 	{
 	  /* This rva is to a name (forwarding function) in our section.  */
 	  /* Should locate a function descriptor.  */
 	  fprintf (file,
 		   "\t[%4ld] +base[%4ld] %04lx %s -- %s\n",
 		   (long) i,
 		   (long) (i + edt.base),
 		   (unsigned long) eat_member,
 		   _("Forwarder RVA"),
 		   data + eat_member - adj);
 	}
       else
 	{
 	  /* Should locate a function descriptor in the reldata section.  */
 	  fprintf (file,
 		   "\t[%4ld] +base[%4ld] %04lx %s\n",
 		   (long) i,
 		   (long) (i + edt.base),
 		   (unsigned long) eat_member,
 		   _("Export RVA"));
 	}
     }
 
   /* The Export Name Pointer Table is paired with the Export Ordinal Table.  */
   /* Dump them in parallel for clarity.  */
   fprintf (file,
 	   _("\n[Ordinal/Name Pointer] Table\n"));
 
   for (i = 0; i < edt.num_names; ++i)
     {
       bfd_vma name_ptr = bfd_get_32 (abfd,
 				    data +
 				    edt.npt_addr
 				    + (i*4) - adj);
 
       char *name = (char *) data + name_ptr - adj;
 
       bfd_vma ord = bfd_get_16 (abfd,
 				    data +
 				    edt.ot_addr
 				    + (i*2) - adj);
       fprintf (file,
 	      "\t[%4ld] %s\n", (long) ord, name);
     }
 
   free (data);
 
   return TRUE;
 }
 
 /* Dump the function table held in the .pdata section of ABFD to
    VFILE (a FILE *).

    The row layout really is architecture dependent.  On IA-64, a .pdata
    entry consists of three dwords containing relative virtual addresses
    that specify the start and end address of the code range the entry
    covers and the address of the corresponding unwind info data.
    Otherwise rows are PDATA_ROW_SIZE (5 * 4) bytes long.

    Returns TRUE if there was nothing to print or the table was dumped,
    FALSE if the section contents could not be read.  */
 
 static bfd_boolean
 pe_print_pdata (bfd * abfd, void * vfile)
 {
 #if defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
 # define PDATA_ROW_SIZE	(3 * 8)
 #else
 # define PDATA_ROW_SIZE	(5 * 4)
 #endif
   FILE *file = (FILE *) vfile;
   bfd_byte *data = 0;
   asection *section = bfd_get_section_by_name (abfd, ".pdata");
   bfd_size_type datasize = 0;
   bfd_size_type i;
   bfd_size_type start, stop;
   int onaline = PDATA_ROW_SIZE;
 
   if (section == NULL
       || coff_section_data (abfd, section) == NULL
       || pei_section_data (abfd, section) == NULL)
     return TRUE;
 
   stop = pei_section_data (abfd, section)->virt_size;
   if ((stop % onaline) != 0)
     fprintf (file,
 	     _("Warning, .pdata section size (%ld) is not a multiple of %d\n"),
 	     (long) stop, onaline);
 
   fprintf (file,
 	   _("\nThe Function Table (interpreted .pdata section contents)\n"));
 #if defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
   fprintf (file,
 	   _(" vma:\t\t\tBegin Address    End Address      Unwind Info\n"));
 #else
   fprintf (file, _("\
  vma:\t\tBegin    End      EH       EH       PrologEnd  Exception\n\
      \t\tAddress  Address  Handler  Data     Address    Mask\n"));
 #endif
 
   datasize = section->size;
   if (datasize == 0)
     return TRUE;
 
   /* The virtual size comes from the section header, while the buffer
      read below only holds section->size bytes.  Never let the walk
      run past the data that is really there.  */
   if (stop > datasize)
     {
       fprintf (file,
 	       _("Warning, .pdata virtual size (%ld) exceeds section size (%ld)\n"),
 	       (long) stop, (long) datasize);
       stop = datasize;
     }
 
   if (! bfd_malloc_and_get_section (abfd, section, &data))
     {
       free (data);
       return FALSE;
     }
 
   start = 0;
 
   for (i = start; i < stop; i += onaline)
     {
       bfd_vma begin_addr;
       bfd_vma end_addr;
       bfd_vma eh_handler;
       bfd_vma eh_data;
       bfd_vma prolog_end_addr;
       int em_data;
 
       /* Stop at a trailing partial row.  */
       if (i + PDATA_ROW_SIZE > stop)
 	break;
 
       begin_addr      = GET_PDATA_ENTRY (abfd, data + i     );
       end_addr        = GET_PDATA_ENTRY (abfd, data + i +  4);
       eh_handler      = GET_PDATA_ENTRY (abfd, data + i +  8);
       eh_data         = GET_PDATA_ENTRY (abfd, data + i + 12);
       prolog_end_addr = GET_PDATA_ENTRY (abfd, data + i + 16);
 
       if (begin_addr == 0 && end_addr == 0 && eh_handler == 0
 	  && eh_data == 0 && prolog_end_addr == 0)
 	/* We are probably into the padding of the section now.  */
 	break;
 
       /* The low bits of the handler and prolog-end fields are flag
 	 bits; gather them into the mask and strip them before the
 	 addresses are printed.  */
       em_data = ((eh_handler & 0x1) << 2) | (prolog_end_addr & 0x3);
       eh_handler &= ~(bfd_vma) 0x3;
       prolog_end_addr &= ~(bfd_vma) 0x3;
 
       fputc (' ', file);
       fprintf_vma (file, i + section->vma); fputc ('\t', file);
       fprintf_vma (file, begin_addr); fputc (' ', file);
       fprintf_vma (file, end_addr); fputc (' ', file);
       fprintf_vma (file, eh_handler);
 #if !defined(COFF_WITH_pep) || defined(COFF_WITH_pex64)
       fputc (' ', file);
       fprintf_vma (file, eh_data); fputc (' ', file);
       fprintf_vma (file, prolog_end_addr);
       fprintf (file, "   %x", em_data);
 #endif
 
 #ifdef POWERPC_LE_PE
       if (eh_handler == 0 && eh_data != 0)
 	{
 	  /* Special bits here, although the meaning may be a little
 	     mysterious. The only one I know for sure is 0x03
 	     Code Significance
 	     0x00 None
 	     0x01 Register Save Millicode
 	     0x02 Register Restore Millicode
 	     0x03 Glue Code Sequence.  */
 	  switch (eh_data)
 	    {
 	    case 0x01:
 	      fprintf (file, _(" Register save millicode"));
 	      break;
 	    case 0x02:
 	      fprintf (file, _(" Register restore millicode"));
 	      break;
 	    case 0x03:
 	      fprintf (file, _(" Glue code sequence"));
 	      break;
 	    default:
 	      break;
 	    }
 	}
 #endif
       fprintf (file, "\n");
     }
 
   free (data);
 
   return TRUE;
 }
 
 /* Numeric value of the HIGHADJ base relocation type.  pe_print_reloc
    tests for it because such an entry is followed by one extra 16 bit
    record.  */
 #define IMAGE_REL_BASED_HIGHADJ 4
 /* Printable names of the base relocation types, indexed by the type
    field of a .reloc entry.  pe_print_reloc maps every type that lies
    beyond the table onto the final "UNKNOWN" slot.  */
 static const char * const tbl[] =
 {
   "ABSOLUTE",
   "HIGH",
   "LOW",
   "HIGHLOW",
   "HIGHADJ",
   "MIPS_JMPADDR",
   "SECTION",
   "REL32",
   "RESERVED1",
   "MIPS_JMPADDR16",
   "DIR64",
   "HIGH3ADJ",
   "UNKNOWN",   /* MUST be last.  */
 };
 
 /* Dump the base relocation blocks held in the .reloc section of ABFD
    to VFILE (a FILE *).

    Every read is checked against the end of the section contents, so a
    block whose size field is too small, too large or zero cannot send
    the walk outside the buffer or make it loop forever.

    Returns TRUE if there was nothing to print or the blocks were
    dumped, FALSE if the section contents could not be read.  */
 static bfd_boolean
 pe_print_reloc (bfd * abfd, void * vfile)
 {
   FILE *file = (FILE *) vfile;
   bfd_byte *data = 0;
   asection *section = bfd_get_section_by_name (abfd, ".reloc");
   bfd_size_type i;
   bfd_size_type stop;
 
   if (section == NULL)
     return TRUE;
 
   if (section->size == 0)
     return TRUE;
 
   fprintf (file,
 	   _("\n\nPE File Base Relocations (interpreted .reloc section contents)\n"));
 
   if (! bfd_malloc_and_get_section (abfd, section, &data))
     {
       free (data);
       return FALSE;
     }
 
   stop = section->size;
 
   /* Only start on a block whose 8 byte header lies entirely inside
      the buffer.  The loop keeps I <= STOP at all times.  */
   for (i = 0; i + 8 <= stop;)
     {
       int j;
       bfd_vma virtual_address;
       long number, size;
       bfd_size_type pos;
       bfd_size_type block_end;
 
       /* The .reloc section is a sequence of blocks, with a header consisting
 	 of two 32 bit quantities, followed by a number of 16 bit entries.  */
       virtual_address = bfd_get_32 (abfd, data + i);
       size = bfd_get_32 (abfd, data + i + 4);
       number = (size - 8) / 2;
 
       if (size == 0)
 	break;
 
       fprintf (file,
 	       _("\nVirtual Address: %08lx Chunk size %ld (0x%lx) Number of fixups %ld\n"),
 	       (unsigned long) virtual_address, size, size, number);
 
       /* Work out where the entries of this block end.  A size smaller
 	 than the header (or negative, should a 32 bit long wrap) gives
 	 an empty block; a size reaching past the section is cut off at
 	 the end of the section.  */
       if (size < 8)
 	block_end = i + 8;
       else if ((bfd_size_type) size > stop - i)
 	block_end = stop;
       else
 	block_end = i + size;
 
       j = 0;
       for (pos = i + 8; pos + 2 <= block_end; pos += 2)
 	{
 	  unsigned short e = bfd_get_16 (abfd, data + pos);
 	  unsigned int t = (e & 0xF000) >> 12;
 	  int off = e & 0x0FFF;
 
 	  if (t >= sizeof (tbl) / sizeof (tbl[0]))
 	    t = (sizeof (tbl) / sizeof (tbl[0])) - 1;
 
 	  fprintf (file,
 		   _("\treloc %4d offset %4x [%4lx] %s"),
 		   j, off, (long) (off + virtual_address), tbl[t]);
 
 	  /* HIGHADJ takes an argument, - the next record *is* the
 	     low 16 bits of addend.  Only consume it when it is still
 	     part of this block.  */
 	  if (t == IMAGE_REL_BASED_HIGHADJ && pos + 4 <= block_end)
 	    {
 	      fprintf (file, " (%4x)",
 		       ((unsigned int)
 			bfd_get_16 (abfd, data + pos + 2)));
 	      pos += 2;
 	      j++;
 	    }
 
 	  fprintf (file, "\n");
 	  j++;
 	}
 
       /* Always moves forward by at least the header size.  */
       i = block_end;
     }
 
   free (data);
 
   return TRUE;
 }
 
 /* Print out the program headers.
 
    Writes the PE-specific header information of ABFD to VFILE (a
    FILE *): the characteristics flags, the time stamp, the optional
    header fields and the data directory, followed by the output of the
    pe_print_idata, pe_print_edata, pe_print_pdata and pe_print_reloc
    dumps.  Always returns TRUE; the results of those four calls are
    not examined.  */
 
 bfd_boolean
 _bfd_XX_print_private_bfd_data_common (bfd * abfd, void * vfile)
 {
   FILE *file = (FILE *) vfile;
   int j;
   pe_data_type *pe = pe_data (abfd);
   struct internal_extra_pe_aouthdr *i = &pe->pe_opthdr;
   const char *subsystem_name = NULL;
   const char *name;
 
   /* The MS dumpbin program reportedly ands with 0xff0f before
      printing the characteristics field.  Not sure why.  No reason to
      emulate it here.  */
   fprintf (file, _("\nCharacteristics 0x%x\n"), pe->real_flags);
   /* PF prints the description Y on its own line when flag X is set.  */
 #undef PF
 #define PF(x, y) if (pe->real_flags & x) { fprintf (file, "\t%s\n", y); }
   PF (IMAGE_FILE_RELOCS_STRIPPED, "relocations stripped");
   PF (IMAGE_FILE_EXECUTABLE_IMAGE, "executable");
   PF (IMAGE_FILE_LINE_NUMS_STRIPPED, "line numbers stripped");
   PF (IMAGE_FILE_LOCAL_SYMS_STRIPPED, "symbols stripped");
   PF (IMAGE_FILE_LARGE_ADDRESS_AWARE, "large address aware");
   PF (IMAGE_FILE_BYTES_REVERSED_LO, "little endian");
   PF (IMAGE_FILE_32BIT_MACHINE, "32 bit words");
   PF (IMAGE_FILE_DEBUG_STRIPPED, "debugging information removed");
   PF (IMAGE_FILE_SYSTEM, "system file");
   PF (IMAGE_FILE_DLL, "DLL");
   PF (IMAGE_FILE_BYTES_REVERSED_HI, "big endian");
 #undef PF
 
   /* ctime implies '\n'.  */
   {
     time_t t = pe->coff.timestamp;
     fprintf (file, "\nTime/Date\t\t%s", ctime (&t));
   }
 
   /* Fallback definitions of the optional header magic numbers, used
      only when they have not been defined already.  */
 #ifndef IMAGE_NT_OPTIONAL_HDR_MAGIC
 # define IMAGE_NT_OPTIONAL_HDR_MAGIC 0x10b
 #endif
 #ifndef IMAGE_NT_OPTIONAL_HDR64_MAGIC
 # define IMAGE_NT_OPTIONAL_HDR64_MAGIC 0x20b
 #endif
 #ifndef IMAGE_NT_OPTIONAL_HDRROM_MAGIC
 # define IMAGE_NT_OPTIONAL_HDRROM_MAGIC 0x107
 #endif
 
   /* Translate the magic number into a name; unknown values are
      printed without one.  */
   switch (i->Magic)
     {
     case IMAGE_NT_OPTIONAL_HDR_MAGIC:
       name = "PE32";
       break;
     case IMAGE_NT_OPTIONAL_HDR64_MAGIC:
       name = "PE32+";
       break;
     case IMAGE_NT_OPTIONAL_HDRROM_MAGIC:
       name = "ROM";
       break;
     default:
       name = NULL;
       break;
     }
   fprintf (file, "Magic\t\t\t%04x", i->Magic);
   if (name)
     fprintf (file, "\t(%s)",name);
   fprintf (file, "\nMajorLinkerVersion\t%d\n", i->MajorLinkerVersion);
   fprintf (file, "MinorLinkerVersion\t%d\n", i->MinorLinkerVersion);
   fprintf (file, "SizeOfCode\t\t%08lx\n", i->SizeOfCode);
   fprintf (file, "SizeOfInitializedData\t%08lx\n",
 	   i->SizeOfInitializedData);
   fprintf (file, "SizeOfUninitializedData\t%08lx\n",
 	   i->SizeOfUninitializedData);
   fprintf (file, "AddressOfEntryPoint\t");
   fprintf_vma (file, i->AddressOfEntryPoint);
   fprintf (file, "\nBaseOfCode\t\t");
   fprintf_vma (file, i->BaseOfCode);
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
   /* PE32+ does not have BaseOfData member!  */
   fprintf (file, "\nBaseOfData\t\t");
   fprintf_vma (file, i->BaseOfData);
 #endif
 
   fprintf (file, "\nImageBase\t\t");
   fprintf_vma (file, i->ImageBase);
   fprintf (file, "\nSectionAlignment\t");
   fprintf_vma (file, i->SectionAlignment);
   fprintf (file, "\nFileAlignment\t\t");
   fprintf_vma (file, i->FileAlignment);
   fprintf (file, "\nMajorOSystemVersion\t%d\n", i->MajorOperatingSystemVersion);
   fprintf (file, "MinorOSystemVersion\t%d\n", i->MinorOperatingSystemVersion);
   fprintf (file, "MajorImageVersion\t%d\n", i->MajorImageVersion);
   fprintf (file, "MinorImageVersion\t%d\n", i->MinorImageVersion);
   fprintf (file, "MajorSubsystemVersion\t%d\n", i->MajorSubsystemVersion);
   fprintf (file, "MinorSubsystemVersion\t%d\n", i->MinorSubsystemVersion);
   fprintf (file, "Win32Version\t\t%08lx\n", i->Reserved1);
   fprintf (file, "SizeOfImage\t\t%08lx\n", i->SizeOfImage);
   fprintf (file, "SizeOfHeaders\t\t%08lx\n", i->SizeOfHeaders);
   fprintf (file, "CheckSum\t\t%08lx\n", i->CheckSum);
 
   /* Translate the subsystem number into a name; unknown values are
      printed without one.  */
   switch (i->Subsystem)
     {
     case IMAGE_SUBSYSTEM_UNKNOWN:
       subsystem_name = "unspecified";
       break;
     case IMAGE_SUBSYSTEM_NATIVE:
       subsystem_name = "NT native";
       break;
     case IMAGE_SUBSYSTEM_WINDOWS_GUI:
       subsystem_name = "Windows GUI";
       break;
     case IMAGE_SUBSYSTEM_WINDOWS_CUI:
       subsystem_name = "Windows CUI";
       break;
     case IMAGE_SUBSYSTEM_POSIX_CUI:
       subsystem_name = "POSIX CUI";
       break;
     case IMAGE_SUBSYSTEM_WINDOWS_CE_GUI:
       subsystem_name = "Wince CUI";
       break;
     case IMAGE_SUBSYSTEM_EFI_APPLICATION:
       subsystem_name = "EFI application";
       break;
     case IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER:
       subsystem_name = "EFI boot service driver";
       break;
     case IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER:
       subsystem_name = "EFI runtime driver";
       break;
     /* These are from revision 8.0 of the MS PE/COFF spec.  */
     case IMAGE_SUBSYSTEM_EFI_ROM:
       subsystem_name = "EFI ROM";
       break;
     case IMAGE_SUBSYSTEM_XBOX:
       subsystem_name = "XBOX";
       break;
     /* Explicit default for clarity; subsystem_name stays NULL.  */
     default:
       subsystem_name = NULL;
     }
 
   fprintf (file, "Subsystem\t\t%08x", i->Subsystem);
   if (subsystem_name)
     fprintf (file, "\t(%s)", subsystem_name);
   fprintf (file, "\nDllCharacteristics\t%08x\n", i->DllCharacteristics);
   fprintf (file, "SizeOfStackReserve\t");
   fprintf_vma (file, i->SizeOfStackReserve);
   fprintf (file, "\nSizeOfStackCommit\t");
   fprintf_vma (file, i->SizeOfStackCommit);
   fprintf (file, "\nSizeOfHeapReserve\t");
   fprintf_vma (file, i->SizeOfHeapReserve);
   fprintf (file, "\nSizeOfHeapCommit\t");
   fprintf_vma (file, i->SizeOfHeapCommit);
   fprintf (file, "\nLoaderFlags\t\t%08lx\n", i->LoaderFlags);
   fprintf (file, "NumberOfRvaAndSizes\t%08lx\n", i->NumberOfRvaAndSizes);
 
   /* One line per data directory slot: index, address, size, name.  */
   fprintf (file, "\nThe Data Directory\n");
   for (j = 0; j < IMAGE_NUMBEROF_DIRECTORY_ENTRIES; j++)
     {
       fprintf (file, "Entry %1x ", j);
       fprintf_vma (file, i->DataDirectory[j].VirtualAddress);
       fprintf (file, " %08lx ", i->DataDirectory[j].Size);
       fprintf (file, "%s\n", dir_names[j]);
     }
 
   pe_print_idata (abfd, vfile);
   pe_print_edata (abfd, vfile);
   pe_print_pdata (abfd, vfile);
   pe_print_reloc (abfd, vfile);
 
   return TRUE;
 }
 
 /* Copy any private info we understand from the input bfd
    to the output bfd.
 
    Nothing is copied unless both IBFD and OBFD are COFF flavoured.
    Always returns TRUE.  */
 
 bfd_boolean
 _bfd_XX_bfd_copy_private_bfd_data_common (bfd * ibfd, bfd * obfd)
 {
   /* One day we may try to grok other private data.  */
   if (ibfd->xvec->flavour == bfd_target_coff_flavour
       && obfd->xvec->flavour == bfd_target_coff_flavour)
     {
       pe_data_type *src = pe_data (ibfd);
       pe_data_type *dst = pe_data (obfd);
 
       dst->pe_opthdr = src->pe_opthdr;
       dst->dll = src->dll;
 
       /* For strip: if we removed .reloc, we'll make a real mess of
 	 things if we don't remove this entry as well.  */
       if (! dst->has_reloc_section)
 	{
 	  dst->pe_opthdr.DataDirectory[PE_BASE_RELOCATION_TABLE].VirtualAddress = 0;
 	  dst->pe_opthdr.DataDirectory[PE_BASE_RELOCATION_TABLE].Size = 0;
 	}
     }
 
   return TRUE;
 }
 
 /* Copy private section data.
 
    When both bfds are COFF flavoured and ISEC carries PE section data,
    make sure OSEC has COFF and PE section data of its own (allocating
    zeroed records on OBFD where they are missing) and copy the virtual
    size and the PE flags across.  Returns FALSE only when one of those
    allocations fails.  */
 
 bfd_boolean
 _bfd_XX_bfd_copy_private_section_data (bfd *ibfd,
 				       asection *isec,
 				       bfd *obfd,
 				       asection *osec)
 {
   struct pei_section_tdata *in_pei;
   struct pei_section_tdata *out_pei;
 
   if (bfd_get_flavour (ibfd) != bfd_target_coff_flavour
       || bfd_get_flavour (obfd) != bfd_target_coff_flavour)
     return TRUE;
 
   /* Without PE data on the input section there is nothing to copy.  */
   if (coff_section_data (ibfd, isec) == NULL)
     return TRUE;
 
   in_pei = pei_section_data (ibfd, isec);
   if (in_pei == NULL)
     return TRUE;
 
   /* Give the output section its COFF record first ...  */
   if (coff_section_data (obfd, osec) == NULL)
     {
       osec->used_by_bfd
 	= bfd_zalloc (obfd, (bfd_size_type) sizeof (struct coff_section_tdata));
       if (osec->used_by_bfd == NULL)
 	return FALSE;
     }
 
   /* ... and then the PE record that hangs off it.  */
   if (pei_section_data (obfd, osec) == NULL)
     {
       coff_section_data (obfd, osec)->tdata
 	= bfd_zalloc (obfd, (bfd_size_type) sizeof (struct pei_section_tdata));
       if (coff_section_data (obfd, osec)->tdata == NULL)
 	return FALSE;
     }
 
   out_pei = pei_section_data (obfd, osec);
   out_pei->virt_size = in_pei->virt_size;
   out_pei->pe_flags = in_pei->pe_flags;
 
   return TRUE;
 }
 
 /* Fill in RET with information about SYMBOL of ABFD.  This simply
    forwards all three arguments to coff_get_symbol_info.  */
 void
 _bfd_XX_get_symbol_info (bfd * abfd, asymbol *symbol, symbol_info *ret)
 {
   coff_get_symbol_info (abfd, symbol, ret);
 }
 
 /* Handle the .idata section and other things that need symbol table
    access.
 
    Fills in the import table, import address table and TLS table
    entries of ABFD's optional header data directory from the symbols
    .idata$2, .idata$4, .idata$5, .idata$6 and __tls_used, which are
    looked up in the link hash table of PFINFO.  Returns FALSE, after
    reporting through _bfd_error_handler, when an entry could not be
    filled in because its symbol could not be resolved to an output
    section; returns TRUE otherwise.  */
 
 bfd_boolean
 _bfd_XXi_final_link_postscript (bfd * abfd, struct coff_final_link_info *pfinfo)
 {
   struct coff_link_hash_entry *h1;
   struct bfd_link_info *info = pfinfo->info;
   bfd_boolean result = TRUE;
 
   /* There are a few fields that need to be filled in now while we
      have symbol table access.
 
      The .idata subsections aren't directly available as sections, but
      they are in the symbol table, so get them from there.  */
 
   /* The import directory.  This is the address of .idata$2, with size
      of .idata$2 + .idata$3.  */
   h1 = coff_link_hash_lookup (coff_hash_table (info),
 			      ".idata$2", FALSE, FALSE, TRUE);
   if (h1 != NULL)
     {
       /* PR ld/2729: We cannot rely upon all the output sections having been
 	 created properly, so check before referencing them.  Issue a warning
 	 message for any sections that could not be found.  */
       if (h1->root.u.def.section != NULL
 	  && h1->root.u.def.section->output_section != NULL)
 	pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_TABLE].VirtualAddress =
 	  (h1->root.u.def.value
 	   + h1->root.u.def.section->output_section->vma
 	   + h1->root.u.def.section->output_offset);
       else
 	{
 	  _bfd_error_handler
 	    (_("%B: unable to fill in DataDictionary[1] because .idata$2 is missing"),
 	     abfd);
 	  result = FALSE;
 	}
 
       /* The import table ends where .idata$4 starts.  */
       h1 = coff_link_hash_lookup (coff_hash_table (info),
 				  ".idata$4", FALSE, FALSE, TRUE);
       if (h1 != NULL
 	  && h1->root.u.def.section != NULL
 	  && h1->root.u.def.section->output_section != NULL)
 	pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_TABLE].Size =
 	  ((h1->root.u.def.value
 	    + h1->root.u.def.section->output_section->vma
 	    + h1->root.u.def.section->output_offset)
 	   - pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_TABLE].VirtualAddress);
       else
 	{
 	  _bfd_error_handler
 	    (_("%B: unable to fill in DataDictionary[1] because .idata$4 is missing"),
 	     abfd);
 	  result = FALSE;
 	}
 
       /* The import address table.  This is the size/address of
          .idata$5.  */
       h1 = coff_link_hash_lookup (coff_hash_table (info),
 				  ".idata$5", FALSE, FALSE, TRUE);
       if (h1 != NULL
 	  && h1->root.u.def.section != NULL
 	  && h1->root.u.def.section->output_section != NULL)
 	pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE].VirtualAddress =
 	  (h1->root.u.def.value
 	   + h1->root.u.def.section->output_section->vma
 	   + h1->root.u.def.section->output_offset);
       else
 	{
 	  _bfd_error_handler
 	    (_("%B: unable to fill in DataDictionary[12] because .idata$5 is missing"),
 	     abfd);
 	  result = FALSE;
 	}
 
       /* The import address table ends where .idata$6 starts.  */
       h1 = coff_link_hash_lookup (coff_hash_table (info),
 				  ".idata$6", FALSE, FALSE, TRUE);
       if (h1 != NULL
 	  && h1->root.u.def.section != NULL
 	  && h1->root.u.def.section->output_section != NULL)
 	pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE].Size =
 	  ((h1->root.u.def.value
 	    + h1->root.u.def.section->output_section->vma
 	    + h1->root.u.def.section->output_offset)
 	   - pe_data (abfd)->pe_opthdr.DataDirectory[PE_IMPORT_ADDRESS_TABLE].VirtualAddress);
       else
 	{
 	  _bfd_error_handler
 	    (_("%B: unable to fill in DataDictionary[PE_IMPORT_ADDRESS_TABLE (12)] because .idata$6 is missing"),
 	     abfd);
 	  result = FALSE;
 	}
     }
 
   /* The TLS directory, located through __tls_used.  Unlike the import
      entries above, its address is stored relative to the image base.  */
   h1 = coff_link_hash_lookup (coff_hash_table (info),
 			      "__tls_used", FALSE, FALSE, TRUE);
   if (h1 != NULL)
     {
       if (h1->root.u.def.section != NULL
 	  && h1->root.u.def.section->output_section != NULL)
 	pe_data (abfd)->pe_opthdr.DataDirectory[PE_TLS_TABLE].VirtualAddress =
 	  (h1->root.u.def.value
 	   + h1->root.u.def.section->output_section->vma
 	   + h1->root.u.def.section->output_offset
 	   - pe_data (abfd)->pe_opthdr.ImageBase);
       else
 	{
 	  _bfd_error_handler
 	    (_("%B: unable to fill in DataDictionary[9] because __tls_used is missing"),
 	     abfd);
 	  result = FALSE;
 	}
 
       /* The TLS directory consists of four pointers followed by two
 	 32 bit fields, so it occupies 0x18 bytes in a PE32 image but
 	 0x28 bytes in a PE32+ image, where pointers are 64 bits wide.  */
 #if !defined(COFF_WITH_pep) && !defined(COFF_WITH_pex64)
       pe_data (abfd)->pe_opthdr.DataDirectory[PE_TLS_TABLE].Size = 0x18;
 #else
       pe_data (abfd)->pe_opthdr.DataDirectory[PE_TLS_TABLE].Size = 0x28;
 #endif
     }
 
   /* If we couldn't find idata$2, we either have an excessively
      trivial program or are in DEEP trouble; we have to assume trivial
      program....  */
   return result;
 }
Index: projects/bsd_rdma_4_9/contrib/binutils
===================================================================
--- projects/bsd_rdma_4_9/contrib/binutils	(revision 326161)
+++ projects/bsd_rdma_4_9/contrib/binutils	(revision 326162)

Property changes on: projects/bsd_rdma_4_9/contrib/binutils
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/contrib/binutils:r325505-326161
Index: projects/bsd_rdma_4_9/share/examples/bhyve/vmrun.sh
===================================================================
--- projects/bsd_rdma_4_9/share/examples/bhyve/vmrun.sh	(revision 326161)
+++ projects/bsd_rdma_4_9/share/examples/bhyve/vmrun.sh	(revision 326162)
@@ -1,371 +1,376 @@
 #!/bin/sh
 #
 # Copyright (c) 2013 NetApp, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 # $FreeBSD$
 #
 
 LOADER=/usr/sbin/bhyveload
 BHYVECTL=/usr/sbin/bhyvectl
 FBSDRUN=/usr/sbin/bhyve
 
 DEFAULT_MEMSIZE=512M
 DEFAULT_CPUS=2
 DEFAULT_TAPDEV=tap0
 DEFAULT_CONSOLE=stdio
 
 DEFAULT_VIRTIO_DISK="./diskdev"
 DEFAULT_ISOFILE="./release.iso"
 
 # Print the message given as $1 on standard output, prefixed with "*** ".
 errmsg() {
 	echo "*** $1"
 }
 
 # Print the command synopsis and the option summary, then the optional
 # error message given as $1 (through errmsg), and exit with status 1.
 usage() {
 	local msg=$1
 
-	echo "Usage: vmrun.sh [-aEhiTv] [-c <CPUs>] [-C <console>] [-d <disk file>]"
+	echo "Usage: vmrun.sh [-aAEhiTv] [-c <CPUs>] [-C <console>] [-d <disk file>]"
 	echo "                [-e <name=value>] [-f <path of firmware>] [-F <size>]"
 	echo "                [-g <gdbport> ] [-H <directory>]"
 	echo "                [-I <location of installation iso>] [-l <loader>]"
 	echo "                [-L <VNC IP for UEFI framebuffer>]"
 	echo "                [-m <memsize>] [-P <port>] [-t <tapdev>] <vmname>"
 	echo ""
 	echo "       -h: display this help message"
 	echo "       -a: force memory mapped local APIC access"
+	echo "       -A: use AHCI disk emulation instead of virtio"
 	echo "       -c: number of virtual cpus (default is ${DEFAULT_CPUS})"
 	echo "       -C: console device (default is ${DEFAULT_CONSOLE})"
 	echo "       -d: virtio diskdev file (default is ${DEFAULT_VIRTIO_DISK})"
 	echo "       -e: set FreeBSD loader environment variable"
 	echo "       -E: Use UEFI mode"
 	echo "       -f: Use a specific UEFI firmware"
 	echo "       -F: Use a custom UEFI GOP framebuffer size (default: w=1024,h=768)"
 	echo "       -g: listen for connection from kgdb at <gdbport>"
 	echo "       -H: host filesystem to export to the loader"
 	echo "       -i: force boot of the Installation CDROM image"
 	echo "       -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})"
 	echo "       -l: the OS loader to use (default is /boot/userboot.so)"
 	echo "       -L: IP address for UEFI GOP VNC server (default: 127.0.0.1)"
 	echo "       -m: memory size (default is ${DEFAULT_MEMSIZE})"
 	echo "       -p: pass-through a host PCI device at bus/slot/func (e.g. 10/0/0)"
 	echo "       -P: UEFI GOP VNC port (default: 5900)"
 	echo "       -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)"
 	echo "       -T: Enable tablet device (for UEFI GOP)"
 	echo "       -u: RTC keeps UTC time"
 	echo "       -v: Wait for VNC client connection before booting VM"
 	echo "       -w: ignore unimplemented MSRs"
 	echo ""
 	# Only print the error line when a message was supplied.
 	[ -n "$msg" ] && errmsg "$msg"
 	exit 1
 }
 
 # Refuse to run unless the effective user id is 0.
 if [ "$(id -u)" -ne 0 ]; then
 	errmsg "This script must be executed with superuser privileges"
 	exit 1
 fi
 
 # Refuse to run unless kldstat can find the vmm module.
 if ! kldstat -n vmm > /dev/null 2>&1; then
 	errmsg "vmm.ko is not loaded"
 	exit 1
 fi
 
 # Defaults; the option loop below overrides them.
 force_install=0
 isofile=${DEFAULT_ISOFILE}
 memsize=${DEFAULT_MEMSIZE}
 console=${DEFAULT_CONSOLE}
 cpus=${DEFAULT_CPUS}
 tap_total=0
 disk_total=0
+disk_emulation="virtio-blk"
 gdbport=0
 loader_opt=""
 bhyverun_opt="-H -A -P"
 pass_total=0
 
 # EFI-specific options
 efi_mode=0
 efi_firmware="/usr/local/share/uefi-firmware/BHYVE_UEFI.fd"
 vncwait=""
 vnchost="127.0.0.1"
 vncport=5900
 fbsize="w=1024,h=768"
 tablet=""
 
 # Every option handled in the case statement must appear in the
 # optstring, with a trailing ':' when it takes an argument.  "L:" was
 # missing, which made the documented -L option fall through to usage.
-while getopts ac:C:d:e:Ef:F:g:hH:iI:l:m:p:P:t:Tuvw c ; do
+while getopts aAc:C:d:e:Ef:F:g:hH:iI:l:L:m:p:P:t:Tuvw c ; do
 	case $c in
 	a)
 		bhyverun_opt="${bhyverun_opt} -a"
 		;;
+	A)
+		disk_emulation="ahci-hd"
+		;;
 	c)
 		cpus=${OPTARG}
 		;;
 	C)
 		console=${OPTARG}
 		;;
 	d)
 		# Split "<file>[,<options>]": the file name goes to
 		# disk_dev<N>, the remainder (leading comma included) to
 		# disk_opts<N>.
 		disk_dev=${OPTARG%%,*}
 		disk_opts=${OPTARG#${disk_dev}}
 		eval "disk_dev${disk_total}=\"${disk_dev}\""
 		eval "disk_opts${disk_total}=\"${disk_opts}\""
 		disk_total=$(($disk_total + 1))
 		;;
 	e)
 		loader_opt="${loader_opt} -e ${OPTARG}"
 		;;
 	E)
 		efi_mode=1
 		;;
 	f)
 		efi_firmware="${OPTARG}"
 		;;
 	F)
 		fbsize="${OPTARG}"
 		;;
 	g)
 		gdbport=${OPTARG}
 		;;
 	H)
 		host_base=`realpath ${OPTARG}`
 		;;
 	i)
 		force_install=1
 		;;
 	I)
 		isofile=${OPTARG}
 		;;
 	l)
 		loader_opt="${loader_opt} -l ${OPTARG}"
 		;;
 	L)
 		vnchost="${OPTARG}"
 		;;
 	m)
 		memsize=${OPTARG}
 		;;
 	p)
 		eval "pass_dev${pass_total}=\"${OPTARG}\""
 		pass_total=$(($pass_total + 1))
 		;;
 	P)
 		vncport="${OPTARG}"
 		;;
 	t)
 		eval "tap_dev${tap_total}=\"${OPTARG}\""
 		tap_total=$(($tap_total + 1))
 		;;
 	T)
 		tablet="-s 30,xhci,tablet"
 		;;
 	u)
 		bhyverun_opt="${bhyverun_opt} -u"
 		;;
 	v)
 		vncwait=",wait"
 		;;
 	w)
 		bhyverun_opt="${bhyverun_opt} -w"
 		;;
 	*)
 		usage
 		;;
 	esac
 done
 
 # Fall back to the default tap device when no -t option was given.
 if [ $tap_total -eq 0 ] ; then
     tap_total=1
     tap_dev0="${DEFAULT_TAPDEV}"
 fi
 # Fall back to the default disk file when no -d option was given.
 if [ $disk_total -eq 0 ] ; then
     disk_total=1
     disk_dev0="${DEFAULT_VIRTIO_DISK}"
 
 fi
 
 # Drop the parsed options; exactly one argument, the VM name, must remain.
 shift $((${OPTIND} - 1))
 
 if [ $# -ne 1 ]; then
 	usage "virtual machine name not specified"
 fi
 
 vmname="$1"
 # Hand the directory given with -H on to the loader.
 if [ -n "${host_base}" ]; then
 	loader_opt="${loader_opt} -h ${host_base}"
 fi
 
 # If PCI passthru devices are configured then guest memory must be wired
 if [ ${pass_total} -gt 0 ]; then
 	loader_opt="${loader_opt} -S"
 	bhyverun_opt="${bhyverun_opt} -S"
 fi
 
 # In UEFI mode the firmware image has to exist before going any further.
 if [ ${efi_mode} -gt 0 ]; then
 	if [ ! -f ${efi_firmware} ]; then
 		echo "Error: EFI Firmware ${efi_firmware} doesn't exist. Try: pkg install uefi-edk2-bhyve"
 		exit 1
 	fi
 fi
 
 # Make sure the disk image named by $1 is usable: create it as an 8G
 # sparse file when it does not exist, then exit with status 1 unless it
 # is both readable and writable.  The name is quoted everywhere so that
 # an empty name or one containing white space is tested as a single
 # word instead of slipping through the checks.
 make_and_check_diskdev()
 {
     local virtio_diskdev="$1"
     # Create the virtio diskdev file if needed
     if [ ! -e "${virtio_diskdev}" ]; then
 	    echo "virtio disk device file \"${virtio_diskdev}\" does not exist."
 	    echo "Creating it ..."
 	    truncate -s 8G "${virtio_diskdev}" > /dev/null
     fi
 
     if [ ! -r "${virtio_diskdev}" ]; then
 	    echo "virtio disk device file \"${virtio_diskdev}\" is not readable"
 	    exit 1
     fi
 
     if [ ! -w "${virtio_diskdev}" ]; then
 	    echo "virtio disk device file \"${virtio_diskdev}\" is not writable"
 	    exit 1
     fi
 }
 
 echo "Launching virtual machine \"$vmname\" ..."
 
 first_diskdev="$disk_dev0"
 
 # Destroy any VM instance of the same name before starting.
 ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1
 
 # Boot loop: run the loader (non-UEFI mode only) and then bhyve, again
 # and again, until one of them exits with a non-zero status.
 while [ 1 ]; do
 
 	# An install is needed unless file(1) reports "boot sector" or
 	# ": Unix Fast File sys" for the first disk.
 	file -s ${first_diskdev} | grep "boot sector" > /dev/null
 	rc=$?
 	if [ $rc -ne 0 ]; then
 		file -s ${first_diskdev} | grep ": Unix Fast File sys" > /dev/null
 		rc=$?
 	fi
 	if [ $rc -ne 0 ]; then
 		need_install=1
 	else
 		need_install=0
 	fi
 
 	# Boot from the installation image when installing, otherwise
 	# from every readable disk.
 	if [ $force_install -eq 1 -o $need_install -eq 1 ]; then
 		if [ ! -r ${isofile} ]; then
 			echo -n "Installation CDROM image \"${isofile}\" "
 			echo    "is not readable"
 			exit 1
 		fi
 		BOOTDISKS="-d ${isofile}"
 		installer_opt="-s 31:0,ahci-cd,${isofile}"
 	else
 		BOOTDISKS=""
 		i=0
 		while [ $i -lt $disk_total ] ; do
 			eval "disk=\$disk_dev${i}"
 			if [ -r ${disk} ] ; then
 				BOOTDISKS="$BOOTDISKS -d ${disk} "
 			fi
 			i=$(($i + 1))
 		done
 		installer_opt=""
 	fi
 
 	# The loader is skipped in UEFI mode.
 	if [ ${efi_mode} -eq 0 ]; then
 		${LOADER} -c ${console} -m ${memsize} ${BOOTDISKS} ${loader_opt} \
 			${vmname}
 		bhyve_exit=$?
 		if [ $bhyve_exit -ne 0 ]; then
 			break
 		fi
 	fi
 
 	#
 	# Build up args for additional tap and disk devices now.
 	#
 	nextslot=2  # slot 0 is hostbridge, slot 1 is lpc
 	devargs=""  # accumulate disk/tap args here
 	i=0
 	while [ $i -lt $tap_total ] ; do
 	    eval "tapname=\$tap_dev${i}"
 	    devargs="$devargs -s $nextslot:0,virtio-net,${tapname} "
 	    nextslot=$(($nextslot + 1))
 	    i=$(($i + 1))
 	done
 
 	i=0
 	while [ $i -lt $disk_total ] ; do
 	    eval "disk=\$disk_dev${i}"
 	    eval "opts=\$disk_opts${i}"
 	    make_and_check_diskdev "${disk}"
-	    devargs="$devargs -s $nextslot:0,virtio-blk,${disk}${opts} "
+	    devargs="$devargs -s $nextslot:0,$disk_emulation,${disk}${opts} "
 	    nextslot=$(($nextslot + 1))
 	    i=$(($i + 1))
 	done
 
 	i=0
 	while [ $i -lt $pass_total ] ; do
 	    eval "pass=\$pass_dev${i}"
 	    devargs="$devargs -s $nextslot:0,passthru,${pass} "
 	    nextslot=$(($nextslot + 1))
 	    i=$(($i + 1))
         done
 
 	# Framebuffer, boot ROM and tablet arguments are only passed in
 	# UEFI mode.
 	efiargs=""
 	if [ ${efi_mode} -gt 0 ]; then
 		efiargs="-s 29,fbuf,tcp=${vnchost}:${vncport},${fbsize}${vncwait}"
 		efiargs="${efiargs} -l bootrom,${efi_firmware}"
 		efiargs="${efiargs} ${tablet}"
 	fi
 
 	${FBSDRUN} -c ${cpus} -m ${memsize} ${bhyverun_opt}		\
 		-g ${gdbport}						\
 		-s 0:0,hostbridge					\
 		-s 1:0,lpc						\
 		${efiargs}						\
 		${devargs}						\
 		-l com1,${console}					\
 		${installer_opt}					\
 		${vmname}
 
 	bhyve_exit=$?
 	# bhyve returns the following status codes:
 	#  0 - VM has been reset
 	#  1 - VM has been powered off
 	#  2 - VM has been halted
 	#  3 - VM generated a triple fault
 	#  all other non-zero status codes are errors
 	#
 	if [ $bhyve_exit -ne 0 ]; then
 		break
 	fi
 done
 
 
 case $bhyve_exit in
 	0|1|2)
 		# Cleanup /dev/vmm entry when bhyve did not exit
 		# due to an error.
 		${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1
 		;;
 esac
 
 # Pass the last loader or bhyve status on to the caller.
 exit $bhyve_exit
Index: projects/bsd_rdma_4_9/stand/common/load_elf.c
===================================================================
--- projects/bsd_rdma_4_9/stand/common/load_elf.c	(revision 326161)
+++ projects/bsd_rdma_4_9/stand/common/load_elf.c	(revision 326162)
@@ -1,1038 +1,1038 @@
 /*-
  * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
  * Copyright (c) 1998 Peter Wemm <peter@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/linker.h>
 #include <sys/module.h>
 #include <sys/stdint.h>
 #include <string.h>
 #include <machine/elf.h>
 #include <stand.h>
 #define FREEBSD_ELF
 #include <link.h>
 
 #include "bootstrap.h"
 
 #define COPYOUT(s,d,l)	archsw.arch_copyout((vm_offset_t)(s), d, l)
 
 #if defined(__i386__) && __ELF_WORD_SIZE == 64
 #undef ELF_TARG_CLASS
 #undef ELF_TARG_MACH
 #define ELF_TARG_CLASS  ELFCLASS64
 #define ELF_TARG_MACH   EM_X86_64
 #endif
 
 /*
  * Loader-side bookkeeping for one ELF image while it is being examined or
  * loaded.  The dynamic-section fields are filled in by __elfN(loadimage)
  * from the image's PT_DYNAMIC entries.
  */
 typedef struct elf_file {
     Elf_Phdr 	*ph;		/* presumably program headers; not set in this file -- TODO confirm */
     Elf_Ehdr	*ehdr;		/* ELF header, points into firstpage */
     Elf_Sym	*symtab;	/* DT_SYMTAB address plus load offset */
     Elf_Hashelt	*hashtab;	/* DT_HASH address plus load offset */
     Elf_Hashelt	nbuckets;	/* first word copied out of hashtab */
     Elf_Hashelt	nchains;	/* second word copied out of hashtab */
     Elf_Hashelt	*buckets;	/* hashtab + 2 */
     Elf_Hashelt	*chains;	/* buckets + nbuckets */
     Elf_Rel	*rel;		/* DT_REL address plus load offset */
     size_t	relsz;		/* DT_RELSZ value, in bytes */
     Elf_Rela	*rela;		/* DT_RELA address plus load offset */
     size_t	relasz;		/* DT_RELASZ value, in bytes */
     char	*strtab;	/* DT_STRTAB address plus load offset */
     size_t	strsz;		/* DT_STRSZ value, in bytes */
     int		fd;		/* open descriptor on the image, -1 if none */
     caddr_t	firstpage;	/* malloc'ed buffer with the start of the file */
     size_t	firstlen;	/* number of bytes actually read into firstpage */
     int		kernel;		/* non-zero when the image is treated as a kernel */
     u_int64_t	off;		/* offset added to image addresses when loading */
 } *elf_file_t;
 
 /* Forward declarations for the file-local helpers defined further down. */
 static int __elfN(loadimage)(struct preloaded_file *mp, elf_file_t ef, u_int64_t loadaddr);
 static int __elfN(lookup_symbol)(struct preloaded_file *mp, elf_file_t ef, const char* name, Elf_Sym* sym);
 static int __elfN(reloc_ptr)(struct preloaded_file *mp, elf_file_t ef,
     Elf_Addr p, void *val, size_t len);
 static int __elfN(parse_modmetadata)(struct preloaded_file *mp, elf_file_t ef,
     Elf_Addr p_start, Elf_Addr p_end);
 static symaddr_fn __elfN(symaddr);
 static char	*fake_modname(const char *name);
 
 /* f_type strings given to preloaded files by __elfN(loadfile_raw). */
 const char	*__elfN(kerneltype) = "elf kernel";
 const char	*__elfN(moduletype) = "elf module";
 
 /* Load offset of the kernel image; set by __elfN(loadimage) for kernels. */
 u_int64_t	__elfN(relocation_offset) = 0;
 
 /*
  * Open (filename), read the start of the file into a freshly allocated
  * PAGE_SIZE buffer and check that it begins with an ELF header matching the
  * target class, data encoding, version and machine.
  *
  * Returns 0 on success; ef->fd is then left open and ef->firstpage,
  * ef->firstlen and ef->ehdr describe the data that was read.  The caller
  * is responsible for closing the descriptor and freeing the buffer.
  *
  * Returns an errno value on failure.  In that case ef->firstpage has been
  * freed and reset to NULL and ef->fd has been closed and reset to -1, so a
  * caller that runs its own "close if fd != -1" cleanup after a failure
  * (as __elfN(load_modmetadata) does) cannot close the descriptor twice.
  */
 static int
 __elfN(load_elf_header)(char *filename, elf_file_t ef)
 {
 	ssize_t			 bytes_read;
 	Elf_Ehdr		*ehdr;
 	int 			 err;
 
 	/*
 	* Open the image, read and validate the ELF header 
 	*/
 	if (filename == NULL)	/* can't handle nameless */
 		return (EFTYPE);
 	if ((ef->fd = open(filename, O_RDONLY)) == -1)
 		return (errno);
 	ef->firstpage = malloc(PAGE_SIZE);
 	if (ef->firstpage == NULL) {
 		/* Use the common exit so that ef->fd is reset as well. */
 		err = ENOMEM;
 		goto error;
 	}
 	bytes_read = read(ef->fd, ef->firstpage, PAGE_SIZE);
 	ef->firstlen = (size_t)bytes_read;
 	if (bytes_read < 0 || ef->firstlen <= sizeof(Elf_Ehdr)) {
 		err = EFTYPE; /* could be EIO, but may be small file */
 		goto error;
 	}
 	ehdr = ef->ehdr = (Elf_Ehdr *)ef->firstpage;
 
 	/* Is it ELF? */
 	if (!IS_ELF(*ehdr)) {
 		err = EFTYPE;
 		goto error;
 	}
 	if (ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || /* Layout ? */
 	    ehdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
 	    ehdr->e_ident[EI_VERSION] != EV_CURRENT || /* Version ? */
 	    ehdr->e_version != EV_CURRENT ||
 	    ehdr->e_machine != ELF_TARG_MACH) { /* Machine ? */
 		err = EFTYPE;
 		goto error;
 	}
 
 	return (0);
 
 error:
 	/* Release everything acquired above and leave (ef) in a clean state. */
 	free(ef->firstpage);
 	ef->firstpage = NULL;
 	if (ef->fd != -1) {
 		close(ef->fd);
 		ef->fd = -1;
 	}
 	return (err);
 }
 
 /*
  * Try to load (filename) as a regular (non-multiboot) ELF kernel or module
  * at (dest).  On success a pointer to the structure describing the loaded
  * object is stored through (result).  This is a thin front end for
  * __elfN(loadfile_raw) with the multiboot flag cleared.
  */
 int
 __elfN(loadfile)(char *filename, u_int64_t dest, struct preloaded_file **result)
 {
 	int error;
 
 	error = __elfN(loadfile_raw)(filename, dest, result, 0);
 	return (error);
 }
 
 /*
  * Load (filename) as an ELF kernel or kld module.
  *
  * The ELF header is read and validated first.  An ET_EXEC image (or, on
  * powerpc, the first image loaded) is treated as a kernel and is refused if
  * a kernel is already loaded; an ET_DYN image is treated as a module and is
  * refused if no kernel of the matching type has been loaded yet, or if
  * (multiboot) is non-zero.  Any other e_type yields EFTYPE.
  *
  * When (multiboot) is non-zero the file type is recorded as
  * "elf multiboot kernel" and the "kernelname" variable is not set.
  *
  * On success returns 0 and stores the new preloaded_file in (*result).
  * On failure returns an errno value; (*result) is not written.
  */
 int
 __elfN(loadfile_raw)(char *filename, u_int64_t dest,
     struct preloaded_file **result, int multiboot)
 {
     struct preloaded_file	*fp, *kfp;
     struct elf_file		ef;
     Elf_Ehdr 			*ehdr;
     int				err;
 
     fp = NULL;
     bzero(&ef, sizeof(struct elf_file));
     ef.fd = -1;
 
     err = __elfN(load_elf_header)(filename, &ef);
     if (err != 0)
     	return (err);
 
     ehdr = ef.ehdr;
 
     /*
      * Check to see what sort of module we are.
      */
     kfp = file_findfile(NULL, __elfN(kerneltype));
 #ifdef __powerpc__
     /*
      * Kernels can be ET_DYN, so just assume the first loaded object is the
      * kernel. This assumption will be checked later.
      */
     if (kfp == NULL)
         ef.kernel = 1;
 #endif
     if (ef.kernel || ehdr->e_type == ET_EXEC) {
 	/* Looks like a kernel */
 	if (kfp != NULL) {
 	    printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: kernel already loaded\n");
 	    err = EPERM;
 	    goto oerr;
 	}
 	/* 
 	 * Calculate destination address based on kernel entrypoint.
 	 *
 	 * For ARM, the destination address is independent of any values in the
 	 * elf header (an ARM kernel can be loaded at any 2MB boundary), so we
 	 * leave dest set to the value calculated by archsw.arch_loadaddr() and
 	 * passed in to this function.
 	 */
 #ifndef __arm__
         if (ehdr->e_type == ET_EXEC)
 	    dest = (ehdr->e_entry & ~PAGE_MASK);
 #endif
 	if ((ehdr->e_entry & ~PAGE_MASK) == 0) {
 	    printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: not a kernel (maybe static binary?)\n");
 	    err = EPERM;
 	    goto oerr;
 	}
 	ef.kernel = 1;
 
     } else if (ehdr->e_type == ET_DYN) {
 	/* Looks like a kld module */
 	if (multiboot != 0) {
 		printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: can't load module as multiboot\n");
 		err = EPERM;
 		goto oerr;
 	}
 	if (kfp == NULL) {
 	    printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: can't load module before kernel\n");
 	    err = EPERM;
 	    goto oerr;
 	}
 	if (strcmp(__elfN(kerneltype), kfp->f_type)) {
 	    printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: can't load module with kernel type '%s'\n", kfp->f_type);
 	    err = EPERM;
 	    goto oerr;
 	}
 	/* Looks OK, go ahead */
 	ef.kernel = 0;
 
     } else {
 	err = EFTYPE;
 	goto oerr;
     }
 
     /* Let the architecture adjust the load address, else page-align it. */
     if (archsw.arch_loadaddr != NULL)
 	dest = archsw.arch_loadaddr(LOAD_ELF, ehdr, dest);
     else
 	dest = roundup(dest, PAGE_SIZE);
 
     /* 
      * Ok, we think we should handle this.
      */
     fp = file_alloc();
     if (fp == NULL) {
 	    printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadfile: cannot allocate module info\n");
 	    err = EPERM;
 	    goto out;
     }
     if (ef.kernel == 1 && multiboot == 0)
 	setenv("kernelname", filename, 1);
     fp->f_name = strdup(filename);
     if (multiboot == 0)
     	fp->f_type = strdup(ef.kernel ?
     	    __elfN(kerneltype) : __elfN(moduletype));
     else
     	fp->f_type = strdup("elf multiboot kernel");
 
 #ifdef ELF_VERBOSE
     if (ef.kernel)
 	printf("%s entry at 0x%jx\n", filename, (uintmax_t)ehdr->e_entry);
 #else
     printf("%s ", filename);
 #endif
 
     /* A zero size or zero load address is reported as an I/O error. */
     fp->f_size = __elfN(loadimage)(fp, &ef, dest);
     if (fp->f_size == 0 || fp->f_addr == 0)
 	goto ioerr;
 
     /* save exec header as metadata */
     file_addmetadata(fp, MODINFOMD_ELFHDR, sizeof(*ehdr), ehdr);
 
     /* Load OK, return module pointer */
     *result = (struct preloaded_file *)fp;
     err = 0;
     goto out;
     
  ioerr:
     err = EIO;
  oerr:
     /* fp is still NULL when we get here from the checks before file_alloc(). */
     file_discard(fp);
  out:
     if (ef.firstpage)
 	free(ef.firstpage);
     if (ef.fd != -1)
     	close(ef.fd);
     return(err);
 }
 
 /*
  * With the file (fd) open on the image, and (ehdr) containing
  * the Elf header, load the image at (off)
  *
  * Copies every PT_LOAD segment to its destination, records section header,
  * .ctors and symbol information as metadata on (fp), then parses the
  * dynamic section to locate the hash, symbol, string and relocation tables
  * and finally the module metadata set.
  *
  * Sets fp->f_addr to the lowest loaded address and returns the span from
  * there to the end of the loaded data (including any symbols appended
  * after the segments).  A return of 0 is treated by the caller as failure.
  */
 static int
 __elfN(loadimage)(struct preloaded_file *fp, elf_file_t ef, u_int64_t off)
 {
     int 	i;
     u_int	j;
     Elf_Ehdr	*ehdr;
     Elf_Phdr	*phdr, *php;
     Elf_Shdr	*shdr;
     char	*shstr;
     int		ret;
     vm_offset_t firstaddr;
     vm_offset_t lastaddr;
     size_t	chunk;
     ssize_t	result;
     Elf_Addr	ssym, esym;
     Elf_Dyn	*dp;
     Elf_Addr	adp;
     Elf_Addr	ctors;
     int		ndp;
     int		symstrindex;
     int		symtabindex;
     Elf_Size	size;
     u_int	fpcopy;
     Elf_Sym	sym;
     Elf_Addr	p_start, p_end;
 
     dp = NULL;
     shdr = NULL;
     ret = 0;
     firstaddr = lastaddr = 0;
     ehdr = ef->ehdr;
     /*
      * For ET_EXEC images turn (off) from a load address into the value that
      * is added to the addresses found in the ELF headers.
      */
     if (ehdr->e_type == ET_EXEC) {
 #if defined(__i386__) || defined(__amd64__)
 #if __ELF_WORD_SIZE == 64
 	off = - (off & 0xffffffffff000000ull);/* x86_64 relocates after locore */
 #else
 	off = - (off & 0xff000000u);	/* i386 relocates after locore */
 #endif
 #elif defined(__powerpc__)
 	/*
 	 * On the purely virtual memory machines like e500, the kernel is
 	 * linked against its final VA range, which is most often not
 	 * available at the loader stage, but only after kernel initializes
 	 * and completes its VM settings. In such cases we cannot use p_vaddr
 	 * field directly to load ELF segments, but put them at some
 	 * 'load-time' locations.
 	 */
 	if (off & 0xf0000000u) {
 	    off = -(off & 0xf0000000u);
 	    /*
 	     * XXX the physical load address should not be hardcoded. Note
 	     * that the Book-E kernel assumes that it's loaded at a 16MB
 	     * boundary for now...
 	     */
 	    off += 0x01000000;
 	    ehdr->e_entry += off;
 #ifdef ELF_VERBOSE
 	    printf("Converted entry 0x%08x\n", ehdr->e_entry);
 #endif
 	} else
 	    off = 0;
 #elif defined(__arm__) && !defined(EFI)
 	/*
 	 * The elf headers in arm kernels specify virtual addresses in all
 	 * header fields, even the ones that should be physical addresses.
 	 * We assume the entry point is in the first page, and masking the page
 	 * offset will leave us with the virtual address the kernel was linked
 	 * at.  We subtract that from the load offset, making 'off' into the
 	 * value which, when added to a virtual address in an elf header,
 	 * translates it to a physical address.  We do the va->pa conversion on
 	 * the entry point address in the header now, so that later we can
 	 * launch the kernel by just jumping to that address.
 	 *
 	 * When booting from UEFI the copyin and copyout functions handle
 	 * adjusting the location relative to the first virtual address.
 	 * Because of this there is no need to adjust the offset or entry
 	 * point address as these will both be handled by the efi code.
 	 */
 	off -= ehdr->e_entry & ~PAGE_MASK;
 	ehdr->e_entry += off;
 #ifdef ELF_VERBOSE
 	printf("ehdr->e_entry 0x%08x, va<->pa off %llx\n", ehdr->e_entry, off);
 #endif
 #else
 	off = 0;		/* other archs use direct mapped kernels */
 #endif
     }
     ef->off = off;
 
     if (ef->kernel)
 	__elfN(relocation_offset) = off;
 
     /* The program headers are taken from the already-read first page. */
     if ((ehdr->e_phoff + ehdr->e_phnum * sizeof(*phdr)) > ef->firstlen) {
 	printf("elf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: program header not within first page\n");
 	goto out;
     }
     phdr = (Elf_Phdr *)(ef->firstpage + ehdr->e_phoff);
 
     for (i = 0; i < ehdr->e_phnum; i++) {
 	/* We want to load PT_LOAD segments only.. */
 	if (phdr[i].p_type != PT_LOAD)
 	    continue;
 
 #ifdef ELF_VERBOSE
 	printf("Segment: 0x%lx@0x%lx -> 0x%lx-0x%lx",
 	    (long)phdr[i].p_filesz, (long)phdr[i].p_offset,
 	    (long)(phdr[i].p_vaddr + off),
 	    (long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1));
 #else
 	if ((phdr[i].p_flags & PF_W) == 0) {
 	    printf("text=0x%lx ", (long)phdr[i].p_filesz);
 	} else {
 	    printf("data=0x%lx", (long)phdr[i].p_filesz);
 	    if (phdr[i].p_filesz < phdr[i].p_memsz)
 		printf("+0x%lx", (long)(phdr[i].p_memsz -phdr[i].p_filesz));
 	    printf(" ");
 	}
 #endif
 	/*
 	 * Whatever part of the segment is already in the first-page buffer
 	 * is copied from there; the remainder is read from the file.
 	 */
 	fpcopy = 0;
 	if (ef->firstlen > phdr[i].p_offset) {
 	    fpcopy = ef->firstlen - phdr[i].p_offset;
 	    archsw.arch_copyin(ef->firstpage + phdr[i].p_offset,
 			       phdr[i].p_vaddr + off, fpcopy);
 	}
 	if (phdr[i].p_filesz > fpcopy) {
 	    if (kern_pread(ef->fd, phdr[i].p_vaddr + off + fpcopy,
 		phdr[i].p_filesz - fpcopy, phdr[i].p_offset + fpcopy) != 0) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
 		    "_loadimage: read failed\n");
 		goto out;
 	    }
 	}
 	/* clear space from oversized segments; eg: bss */
 	if (phdr[i].p_filesz < phdr[i].p_memsz) {
 #ifdef ELF_VERBOSE
 	    printf(" (bss: 0x%lx-0x%lx)",
 		(long)(phdr[i].p_vaddr + off + phdr[i].p_filesz),
 		(long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1));
 #endif
 
 	    kern_bzero(phdr[i].p_vaddr + off + phdr[i].p_filesz,
 		phdr[i].p_memsz - phdr[i].p_filesz);
 	}
 #ifdef ELF_VERBOSE
 	printf("\n");
 #endif
 
 	if (archsw.arch_loadseg != NULL)
 	    archsw.arch_loadseg(ehdr, phdr + i, off);
 
 	/* Track the lowest and highest addresses touched by any segment. */
 	if (firstaddr == 0 || firstaddr > (phdr[i].p_vaddr + off))
 	    firstaddr = phdr[i].p_vaddr + off;
 	if (lastaddr == 0 || lastaddr < (phdr[i].p_vaddr + off + phdr[i].p_memsz))
 	    lastaddr = phdr[i].p_vaddr + off + phdr[i].p_memsz;
     }
     lastaddr = roundup(lastaddr, sizeof(long));
 
     /*
      * Get the section headers.  We need this for finding the .ctors
      * section as well as for loading any symbols.  Both may be hard
      * to do if reading from a .gz file as it involves seeking.  I
      * think the rule is going to have to be that you must strip a
      * file to remove symbols before gzipping it.
      */
-    chunk = ehdr->e_shnum * ehdr->e_shentsize;
+    chunk = (size_t)ehdr->e_shnum * (size_t)ehdr->e_shentsize;
     if (chunk == 0 || ehdr->e_shoff == 0)
 	goto nosyms;
     shdr = alloc_pread(ef->fd, ehdr->e_shoff, chunk);
     if (shdr == NULL) {
 	printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
 	    "_loadimage: failed to read section headers");
 	goto nosyms;
     }
     file_addmetadata(fp, MODINFOMD_SHDR, chunk, shdr);
 
     /*
      * Read the section string table and look for the .ctors section.
      * We need to tell the kernel where it is so that it can call the
      * ctors.
      */
     /* NOTE(review): e_shstrndx is not range-checked against e_shnum here. */
     chunk = shdr[ehdr->e_shstrndx].sh_size;
     if (chunk) {
 	shstr = alloc_pread(ef->fd, shdr[ehdr->e_shstrndx].sh_offset, chunk);
 	if (shstr) {
 	    for (i = 0; i < ehdr->e_shnum; i++) {
 		if (strcmp(shstr + shdr[i].sh_name, ".ctors") != 0)
 		    continue;
 		ctors = shdr[i].sh_addr;
 		file_addmetadata(fp, MODINFOMD_CTORS_ADDR, sizeof(ctors),
 		    &ctors);
 		size = shdr[i].sh_size;
 		file_addmetadata(fp, MODINFOMD_CTORS_SIZE, sizeof(size),
 		    &size);
 		break;
 	    }
 	    free(shstr);
 	}
     }
 
     /*
      * Now load any symbols.
      */
     symtabindex = -1;
     symstrindex = -1;
     for (i = 0; i < ehdr->e_shnum; i++) {
 	if (shdr[i].sh_type != SHT_SYMTAB)
 	    continue;
 	for (j = 0; j < ehdr->e_phnum; j++) {
 	    if (phdr[j].p_type != PT_LOAD)
 		continue;
 	    if (shdr[i].sh_offset >= phdr[j].p_offset &&
 		(shdr[i].sh_offset + shdr[i].sh_size <=
 		 phdr[j].p_offset + phdr[j].p_filesz)) {
 		shdr[i].sh_offset = 0;
 		shdr[i].sh_size = 0;
 		break;
 	    }
 	}
 	if (shdr[i].sh_offset == 0 || shdr[i].sh_size == 0)
 	    continue;		/* already loaded in a PT_LOAD above */
 	/* Save it for loading below */
 	symtabindex = i;
 	symstrindex = shdr[i].sh_link;
     }
     if (symtabindex < 0 || symstrindex < 0)
 	goto nosyms;
 
     /* Ok, committed to a load. */
 #ifndef ELF_VERBOSE
     printf("syms=[");
 #endif
     ssym = lastaddr;
     /*
      * Two passes: the symbol table first, then its string table.  Each
      * index is set to -1 once its section has been loaded, which ends the
      * loop after the second pass.
      */
     for (i = symtabindex; i >= 0; i = symstrindex) {
 #ifdef ELF_VERBOSE
 	char	*secname;
 
 	switch(shdr[i].sh_type) {
 	    case SHT_SYMTAB:		/* Symbol table */
 		secname = "symtab";
 		break;
 	    case SHT_STRTAB:		/* String table */
 		secname = "strtab";
 		break;
 	    default:
 		secname = "WHOA!!";
 		break;
 	}
 #endif
 
 	/* Each table is stored as its size followed by its contents. */
 	size = shdr[i].sh_size;
 	archsw.arch_copyin(&size, lastaddr, sizeof(size));
 	lastaddr += sizeof(size);
 
 #ifdef ELF_VERBOSE
 	printf("\n%s: 0x%jx@0x%jx -> 0x%jx-0x%jx", secname,
 	    (uintmax_t)shdr[i].sh_size, (uintmax_t)shdr[i].sh_offset,
 	    (uintmax_t)lastaddr, (uintmax_t)(lastaddr + shdr[i].sh_size));
 #else
 	if (i == symstrindex)
 	    printf("+");
 	printf("0x%lx+0x%lx", (long)sizeof(size), (long)size);
 #endif
 
 	if (lseek(ef->fd, (off_t)shdr[i].sh_offset, SEEK_SET) == -1) {
 	    printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: could not seek for symbols - skipped!");
 	    lastaddr = ssym;
 	    ssym = 0;
 	    goto nosyms;
 	}
 	result = archsw.arch_readin(ef->fd, lastaddr, shdr[i].sh_size);
 	if (result < 0 || (size_t)result != shdr[i].sh_size) {
 	    printf("\nelf" __XSTRING(__ELF_WORD_SIZE) "_loadimage: could not read symbols - skipped! (%ju != %ju)", (uintmax_t)result,
 		(uintmax_t)shdr[i].sh_size);
 	    lastaddr = ssym;
 	    ssym = 0;
 	    goto nosyms;
 	}
 	/* Reset offsets relative to ssym */
 	lastaddr += shdr[i].sh_size;
 	lastaddr = roundup(lastaddr, sizeof(size));
 	if (i == symtabindex)
 	    symtabindex = -1;
 	else if (i == symstrindex)
 	    symstrindex = -1;
     }
     esym = lastaddr;
 #ifndef ELF_VERBOSE
     printf("]");
 #endif
 
     file_addmetadata(fp, MODINFOMD_SSYM, sizeof(ssym), &ssym);
     file_addmetadata(fp, MODINFOMD_ESYM, sizeof(esym), &esym);
 
 nosyms:
     printf("\n");
 
     /* From here on the load counts as successful: ret is the loaded size. */
     ret = lastaddr - firstaddr;
     fp->f_addr = firstaddr;
 
     php = NULL;
     for (i = 0; i < ehdr->e_phnum; i++) {
 	if (phdr[i].p_type == PT_DYNAMIC) {
 	    php = phdr + i;
 	    adp = php->p_vaddr;
 	    file_addmetadata(fp, MODINFOMD_DYNAMIC, sizeof(adp), &adp);
 	    break;
 	}
     }
 
     if (php == NULL)	/* this is bad, we cannot get to symbols or _DYNAMIC */
 	goto out;
 
     /* Pull a private copy of the dynamic section out of the loaded image. */
     ndp = php->p_filesz / sizeof(Elf_Dyn);
     if (ndp == 0)
 	goto out;
     dp = malloc(php->p_filesz);
     if (dp == NULL)
 	goto out;
     archsw.arch_copyout(php->p_vaddr + off, dp, php->p_filesz);
 
     ef->strsz = 0;
     for (i = 0; i < ndp; i++) {
 	if (dp[i].d_tag == 0)
 	    break;
 	switch (dp[i].d_tag) {
 	case DT_HASH:
 	    ef->hashtab = (Elf_Hashelt*)(uintptr_t)(dp[i].d_un.d_ptr + off);
 	    break;
 	case DT_STRTAB:
 	    ef->strtab = (char *)(uintptr_t)(dp[i].d_un.d_ptr + off);
 	    break;
 	case DT_STRSZ:
 	    ef->strsz = dp[i].d_un.d_val;
 	    break;
 	case DT_SYMTAB:
 	    ef->symtab = (Elf_Sym*)(uintptr_t)(dp[i].d_un.d_ptr + off);
 	    break;
 	case DT_REL:
 	    ef->rel = (Elf_Rel *)(uintptr_t)(dp[i].d_un.d_ptr + off);
 	    break;
 	case DT_RELSZ:
 	    ef->relsz = dp[i].d_un.d_val;
 	    break;
 	case DT_RELA:
 	    ef->rela = (Elf_Rela *)(uintptr_t)(dp[i].d_un.d_ptr + off);
 	    break;
 	case DT_RELASZ:
 	    ef->relasz = dp[i].d_un.d_val;
 	    break;
 	default:
 	    break;
 	}
     }
     if (ef->hashtab == NULL || ef->symtab == NULL ||
 	ef->strtab == NULL || ef->strsz == 0)
 	goto out;
     /* Hash table layout: nbuckets, nchains, buckets[], chains[]. */
     COPYOUT(ef->hashtab, &ef->nbuckets, sizeof(ef->nbuckets));
     COPYOUT(ef->hashtab + 1, &ef->nchains, sizeof(ef->nchains));
     ef->buckets = ef->hashtab + 2;
     ef->chains = ef->buckets + ef->nbuckets;
 
     /*
      * NOTE(review): the two early returns below bypass the cleanup at
      * "out", so dp and shdr are not freed on these paths, and they return
      * 0 / ENOENT where every other path returns the loaded size in ret.
      * Confirm whether "goto out" was intended.
      */
     if (__elfN(lookup_symbol)(fp, ef, "__start_set_modmetadata_set", &sym) != 0)
 	return 0;
     p_start = sym.st_value + ef->off;
     if (__elfN(lookup_symbol)(fp, ef, "__stop_set_modmetadata_set", &sym) != 0)
 	return ENOENT;
     p_end = sym.st_value + ef->off;
 
     if (__elfN(parse_modmetadata)(fp, ef, p_start, p_end) == 0)
 	goto out;
 
     if (ef->kernel)			/* kernel must not depend on anything */
 	goto out;
 
 out:
     if (dp)
 	free(dp);
     if (shdr)
 	free(shdr);
     return ret;
 }
 
 static char invalid_name[] = "bad";
 
 /*
  * Build a module name from a file path: the directory part (everything up
  * to and including the last '/') and the extension (everything from the
  * last '.' of the base name) are dropped.  "/boot/kernel/foo.ko" yields
  * "foo".
  *
  * The extension is searched for in the base name only, so a '.' in a
  * directory component (e.g. "/boot.d/kernel") can no longer place the end
  * pointer in front of the start pointer and produce a wrapped-around
  * length.  If nothing is left of the base name, "bad" is used instead.
  *
  * Returns a malloc'ed, NUL-terminated string that the caller must free,
  * or NULL if the allocation fails.
  */
 char *
 fake_modname(const char *name)
 {
     const char *sp, *ep;
     char *fp;
     size_t len;
 
     /* Skip the directory part, if any. */
     sp = strrchr(name, '/');
     if (sp)
 	sp++;
     else
 	sp = name;
 
     /* Cut at the extension of the base name, or at its end. */
     ep = strrchr(sp, '.');
     if (ep == NULL)
 	ep = sp + strlen(sp);
     if (ep == sp) {
 	/* Empty stem: fall back to a recognisable placeholder. */
 	sp = invalid_name;
 	ep = invalid_name + sizeof(invalid_name) - 1;
     }
 
     len = ep - sp;
     fp = malloc(len + 1);
     if (fp == NULL)
 	return NULL;
     memcpy(fp, sp, len);
     fp[len] = '\0';
     return fp;
 }
 
 #if (defined(__i386__) || defined(__powerpc__)) && __ELF_WORD_SIZE == 64
 struct mod_metadata64 {
 	int		md_version;	/* structure version MDTV_* */  
 	int		md_type;	/* type of entry MDT_* */
 	u_int64_t	md_data;	/* specific data */
 	u_int64_t	md_cval;	/* common string label */
 };
 #endif
 #if defined(__amd64__) && __ELF_WORD_SIZE == 32
 struct mod_metadata32 {
 	int		md_version;	/* structure version MDTV_* */  
 	int		md_type;	/* type of entry MDT_* */
 	u_int32_t	md_data;	/* specific data */
 	u_int32_t	md_cval;	/* common string label */
 };
 #endif
 
 /*
  * Read just enough of the already registered file (fp) to parse its module
  * metadata: the set_modmetadata_set section and the .data and .rodata
  * sections are read to (dest) and then handed to
  * __elfN(parse_modmetadata).
  *
  * Returns 0 on success or an errno value on failure.  All temporary
  * buffers and the file descriptor are released before returning.
  */
 int
 __elfN(load_modmetadata)(struct preloaded_file *fp, u_int64_t dest)
 {
 	struct elf_file		 ef;
 	int			 err, i, j;
 	Elf_Shdr		*sh_meta, *shdr = NULL;
 	Elf_Shdr		*sh_data[2];
 	char			*shstrtab = NULL;
 	size_t			 size;
 	Elf_Addr		 p_start, p_end;
 
 	bzero(&ef, sizeof(struct elf_file));
 	ef.fd = -1;
 
 	err = __elfN(load_elf_header)(fp->f_name, &ef);
 	if (err != 0)
 		goto out;
 
 	if (ef.kernel == 1 || ef.ehdr->e_type == ET_EXEC) {
 		ef.kernel = 1;
 	} else if (ef.ehdr->e_type != ET_DYN) {
 		err = EFTYPE;
 		goto out;
 	}
 
 	/* Read the whole section header table. */
-	size = ef.ehdr->e_shnum * ef.ehdr->e_shentsize;
+	size = (size_t)ef.ehdr->e_shnum * (size_t)ef.ehdr->e_shentsize;
 	shdr = alloc_pread(ef.fd, ef.ehdr->e_shoff, size);
 	if (shdr == NULL) {
 		err = ENOMEM;
 		goto out;
 	}
 
 	/* Load shstrtab. */
 	/* NOTE(review): e_shstrndx is not range-checked against e_shnum here. */
 	shstrtab = alloc_pread(ef.fd, shdr[ef.ehdr->e_shstrndx].sh_offset,
 	    shdr[ef.ehdr->e_shstrndx].sh_size);
 	if (shstrtab == NULL) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
 		    "load_modmetadata: unable to load shstrtab\n");
 		err = EFTYPE;
 		goto out;
 	}
 
 	/* Find set_modmetadata_set and data sections. */
 	/*
 	 * NOTE(review): sh_data has two slots but j is incremented for every
 	 * section named .data or .rodata without a bound; a file with more
 	 * than two such sections would write past the array.
 	 */
 	sh_data[0] = sh_data[1] = sh_meta = NULL;
 	for (i = 0, j = 0; i < ef.ehdr->e_shnum; i++) {
 		if (strcmp(&shstrtab[shdr[i].sh_name],
 		    "set_modmetadata_set") == 0) {
 			sh_meta = &shdr[i];
 		}
 		if ((strcmp(&shstrtab[shdr[i].sh_name], ".data") == 0) ||
 		    (strcmp(&shstrtab[shdr[i].sh_name], ".rodata") == 0)) {
 			sh_data[j++] = &shdr[i];
 		}
 	}
 	if (sh_meta == NULL || sh_data[0] == NULL || sh_data[1] == NULL) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
     "load_modmetadata: unable to find set_modmetadata_set or data sections\n");
 		err = EFTYPE;
 		goto out;
 	}
 
 	/* Load set_modmetadata_set into memory */
 	err = kern_pread(ef.fd, dest, sh_meta->sh_size, sh_meta->sh_offset);
 	if (err != 0) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
     "load_modmetadata: unable to load set_modmetadata_set: %d\n", err);
 		goto out;
 	}
 	/* The pointer set occupies [p_start, p_end) at the start of dest. */
 	p_start = dest;
 	p_end = dest + sh_meta->sh_size;
 	dest += sh_meta->sh_size;
 
 	/* Load data sections into memory. */
 	err = kern_pread(ef.fd, dest, sh_data[0]->sh_size,
 	    sh_data[0]->sh_offset);
 	if (err != 0) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
 		    "load_modmetadata: unable to load data: %d\n", err);
 		goto out;
 	}
 
 	/*
 	 * We have to increment the dest, so that the offset is the same into
 	 * both the .rodata and .data sections.
 	 */
 	ef.off = -(sh_data[0]->sh_addr - dest);
 	dest +=	(sh_data[1]->sh_addr - sh_data[0]->sh_addr);
 
 	err = kern_pread(ef.fd, dest, sh_data[1]->sh_size,
 	    sh_data[1]->sh_offset);
 	if (err != 0) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
 		    "load_modmetadata: unable to load data: %d\n", err);
 		goto out;
 	}
 
 	err = __elfN(parse_modmetadata)(fp, &ef, p_start, p_end);
 	if (err != 0) {
 		printf("\nelf" __XSTRING(__ELF_WORD_SIZE)
 		    "load_modmetadata: unable to parse metadata: %d\n", err);
 		goto out;
 	}
 
 out:
 	/* Common exit: release whatever was acquired, on success or failure. */
 	if (shstrtab != NULL)
 		free(shstrtab);
 	if (shdr != NULL)
 		free(shdr);
 	if (ef.firstpage != NULL)
 		free(ef.firstpage);
 	if (ef.fd != -1)
 		close(ef.fd);
 	return (err);
 }
 
 /*
  * Walk the array of metadata pointers in [p_start, p_end) inside the loaded
  * image and record what it describes on (fp):
  *
  *  - MDT_DEPEND entries (ignored for kernels) are added as
  *    MODINFOMD_DEPLIST metadata: the mod_depend structure followed by the
  *    name of the module depended upon;
  *  - MDT_VERSION entries are registered with file_addmodule().
  *
  * Each pointer and record is passed through __elfN(reloc_ptr); when that
  * reports EOPNOTSUPP the plain load offset ef->off is applied instead.
  * If no MDT_VERSION entry is found, a module name derived from the file
  * name is registered with version 1.
  *
  * Returns 0 on success, ENOMEM if a temporary allocation fails, or the
  * error reported by __elfN(reloc_ptr).
  */
 int
 __elfN(parse_modmetadata)(struct preloaded_file *fp, elf_file_t ef,
     Elf_Addr p_start, Elf_Addr p_end)
 {
     struct mod_metadata md;
 #if (defined(__i386__) || defined(__powerpc__)) && __ELF_WORD_SIZE == 64
     struct mod_metadata64 md64;
 #elif defined(__amd64__) && __ELF_WORD_SIZE == 32
     struct mod_metadata32 md32;
 #endif
     struct mod_depend *mdepend;
     struct mod_version mver;
     char *s;
     int error, modcnt, minfolen;
     Elf_Addr v, p;
 
     modcnt = 0;
     p = p_start;
     while (p < p_end) {
 	/* Fetch the next pointer of the set and relocate it. */
 	COPYOUT(p, &v, sizeof(v));
 	error = __elfN(reloc_ptr)(fp, ef, p, &v, sizeof(v));
 	if (error == EOPNOTSUPP)
 	    v += ef->off;
 	else if (error != 0)
 	    return (error);
 	/*
 	 * Fetch the record it points to.  When the loader and the image
 	 * differ in word size the record is read in the image's layout
 	 * and converted to the native struct mod_metadata.
 	 */
 #if (defined(__i386__) || defined(__powerpc__)) && __ELF_WORD_SIZE == 64
 	COPYOUT(v, &md64, sizeof(md64));
 	error = __elfN(reloc_ptr)(fp, ef, v, &md64, sizeof(md64));
 	if (error == EOPNOTSUPP) {
 	    md64.md_cval += ef->off;
 	    md64.md_data += ef->off;
 	} else if (error != 0)
 	    return (error);
 	md.md_version = md64.md_version;
 	md.md_type = md64.md_type;
 	md.md_cval = (const char *)(uintptr_t)md64.md_cval;
 	md.md_data = (void *)(uintptr_t)md64.md_data;
 #elif defined(__amd64__) && __ELF_WORD_SIZE == 32
 	COPYOUT(v, &md32, sizeof(md32));
 	error = __elfN(reloc_ptr)(fp, ef, v, &md32, sizeof(md32));
 	if (error == EOPNOTSUPP) {
 	    md32.md_cval += ef->off;
 	    md32.md_data += ef->off;
 	} else if (error != 0)
 	    return (error);
 	md.md_version = md32.md_version;
 	md.md_type = md32.md_type;
 	md.md_cval = (const char *)(uintptr_t)md32.md_cval;
 	md.md_data = (void *)(uintptr_t)md32.md_data;
 #else
 	COPYOUT(v, &md, sizeof(md));
 	error = __elfN(reloc_ptr)(fp, ef, v, &md, sizeof(md));
 	if (error == EOPNOTSUPP) {
 	    md.md_cval += ef->off;
 	    md.md_data = (void *)((uintptr_t)md.md_data + (uintptr_t)ef->off);
 	} else if (error != 0)
 	    return (error);
 #endif
 	p += sizeof(Elf_Addr);
 	switch(md.md_type) {
 	  case MDT_DEPEND:
 	    if (ef->kernel)		/* kernel must not depend on anything */
 	      break;
 	    s = strdupout((vm_offset_t)md.md_cval);
 	    if (s == NULL)		/* do not hand NULL to strlen() */
 		return ENOMEM;
 	    minfolen = sizeof(*mdepend) + strlen(s) + 1;
 	    mdepend = malloc(minfolen);
 	    if (mdepend == NULL) {
 		free(s);		/* do not leak the name on failure */
 		return ENOMEM;
 	    }
 	    COPYOUT((vm_offset_t)md.md_data, mdepend, sizeof(*mdepend));
 	    /* The dependency name is stored right after the structure. */
 	    strcpy((char*)(mdepend + 1), s);
 	    free(s);
 	    file_addmetadata(fp, MODINFOMD_DEPLIST, minfolen, mdepend);
 	    free(mdepend);
 	    break;
 	  case MDT_VERSION:
 	    s = strdupout((vm_offset_t)md.md_cval);
 	    if (s == NULL)
 		return ENOMEM;
 	    COPYOUT((vm_offset_t)md.md_data, &mver, sizeof(mver));
 	    file_addmodule(fp, s, mver.mv_version, NULL);
 	    free(s);
 	    modcnt++;
 	    break;
 	}
     }
     /* No version record at all: register a name derived from the file. */
     if (modcnt == 0) {
 	s = fake_modname(fp->f_name);
 	file_addmodule(fp, s, 1, NULL);
 	free(s);
     }
     return 0;
 }
 
 /*
  * Compute the ELF symbol hash of the NUL-terminated string (name), as used
  * to index the hash table that __elfN(lookup_symbol) walks.
  */
 static unsigned long
 elf_hash(const char *name)
 {
     const unsigned char *cursor;
     unsigned long hash, top;
 
     hash = 0;
     for (cursor = (const unsigned char *)name; *cursor != '\0'; cursor++) {
 	hash = (hash << 4) + *cursor;
 	/* Fold bits 28-31 back into the low byte, then clear them. */
 	top = hash & 0xf0000000;
 	if (top != 0)
 	    hash ^= top >> 24;
 	hash &= ~top;
     }
     return hash;
 }
 
 static const char __elfN(bad_symtable)[] = "elf" __XSTRING(__ELF_WORD_SIZE) "_lookup_symbol: corrupt symbol table\n";
 
 /*
  * Look (name) up in the loaded image's ELF hash table, using the tables
  * recorded in (ef).  All table data is fetched from the image with
  * COPYOUT().
  *
  * On success returns 0 and copies the symbol into (*symp).  Returns ENOENT
  * if the symbol is absent, is undefined, or the table is found to be
  * corrupt (a message is printed in the last case), and ENOMEM if a symbol
  * name cannot be copied out of the image.
  */
 int
 __elfN(lookup_symbol)(struct preloaded_file *fp, elf_file_t ef, const char* name,
 		  Elf_Sym *symp)
 {
     Elf_Hashelt symnum;
     Elf_Sym sym;
     char *strp;
     unsigned long hash;
 
     /* An empty bucket array would make the modulo below divide by zero. */
     if (ef->nbuckets == 0) {
 	printf(__elfN(bad_symtable));
 	return ENOENT;
     }
 
     hash = elf_hash(name);
     COPYOUT(&ef->buckets[hash % ef->nbuckets], &symnum, sizeof(symnum));
 
     /* Follow the chain for this bucket until it ends or (name) is found. */
     while (symnum != STN_UNDEF) {
 	if (symnum >= ef->nchains) {
 	    printf(__elfN(bad_symtable));
 	    return ENOENT;
 	}
 
 	COPYOUT(ef->symtab + symnum, &sym, sizeof(sym));
 	if (sym.st_name == 0) {
 	    printf(__elfN(bad_symtable));
 	    return ENOENT;
 	}
 
 	strp = strdupout((vm_offset_t)(ef->strtab + sym.st_name));
 	if (strp == NULL)		/* do not hand NULL to strcmp() */
 	    return ENOMEM;
 	if (strcmp(name, strp) == 0) {
 	    free(strp);
 	    /* Only defined symbols (or functions with a value) count. */
 	    if (sym.st_shndx != SHN_UNDEF ||
 		(sym.st_value != 0 &&
 		 ELF_ST_TYPE(sym.st_info) == STT_FUNC)) {
 		*symp = sym;
 		return 0;
 	    }
 	    return ENOENT;
 	}
 	free(strp);
 	COPYOUT(&ef->chains[symnum], &symnum, sizeof(symnum));
     }
     return ENOENT;
 }
 
 /*
  * Run the image's own relocations over a value that has been copied out of
  * it.  (p) is the address the value was loaded at; (val)/(len) describe the
  * local copy that gets adjusted.  The loaded image itself is left alone,
  * since kern_linker relocates it in place later on.
  *
  * Returns EOPNOTSUPP for kernels, 0 on success, or the first error
  * reported by __elfN(reloc).
  */
 static int
 __elfN(reloc_ptr)(struct preloaded_file *mp, elf_file_t ef,
     Elf_Addr p, void *val, size_t len)
 {
 	Elf_Rel rel_ent;
 	Elf_Rela rela_ent;
 	size_t idx;
 	int rc;
 
 	/*
 	 * The kernel is already relocated, but we still want to apply
 	 * offset adjustments.
 	 */
 	if (ef->kernel)
 		return (EOPNOTSUPP);
 
 	/* First the REL entries ... */
 	idx = 0;
 	while (idx < ef->relsz / sizeof(rel_ent)) {
 		COPYOUT(&ef->rel[idx], &rel_ent, sizeof(rel_ent));
 
 		rc = __elfN(reloc)(ef, __elfN(symaddr), &rel_ent,
 		    ELF_RELOC_REL, ef->off, p, val, len);
 		if (rc != 0)
 			return (rc);
 		idx++;
 	}
 
 	/* ... then the RELA entries. */
 	idx = 0;
 	while (idx < ef->relasz / sizeof(rela_ent)) {
 		COPYOUT(&ef->rela[idx], &rela_ent, sizeof(rela_ent));
 
 		rc = __elfN(reloc)(ef, __elfN(symaddr), &rela_ent,
 		    ELF_RELOC_RELA, ef->off, p, val, len);
 		if (rc != 0)
 			return (rc);
 		idx++;
 	}
 
 	return (0);
 }
 
 /*
  * Symbol-address callback handed to __elfN(reloc).  Looking symbols up by
  * index is not needed by this file, so every index resolves to 0.
  */
 static Elf_Addr
 __elfN(symaddr)(struct elf_file *ef, Elf_Size symidx)
 {
 	Elf_Addr addr;
 
 	addr = 0;
 	return (addr);
 }
Index: projects/bsd_rdma_4_9/stand/userboot/test/test.c
===================================================================
--- projects/bsd_rdma_4_9/stand/userboot/test/test.c	(revision 326161)
+++ projects/bsd_rdma_4_9/stand/userboot/test/test.c	(revision 326162)
@@ -1,475 +1,475 @@
 /*-
  * Copyright (c) 2011 Google, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/types.h>
 #include <sys/disk.h>
 #include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <dirent.h>
 #include <dlfcn.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <getopt.h>
 #include <inttypes.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <termios.h>
 #include <unistd.h>
 
 #include <userboot.h>
 
 /*
  * Host directory prefixed to every path by test_open(); set from the -h
  * option in main().  While NULL, test_open() fails with ENOENT.
  */
 char *host_base = NULL;
 /* Terminal settings: `term` is the modified copy, `oldterm` is restored by test_exit(). */
 struct termios term, oldterm;
 /* Buffer (image_size bytes, allocated in main()) that test_copyin()/test_copyout() read and write. */
 char *image;
 size_t image_size;
 /* Descriptor of the disk image opened for -d; -1 means no disk is attached. */
 int disk_fd = -1;
 
 /* Values recorded by test_setreg(), indexed by register number 0..15. */
 uint64_t regs[16];
 /* NOTE(review): not written or read by any function visible in this file. */
 uint64_t pc;
 
 /* Forward declaration: test_exec() calls test_exit() before its definition. */
 void test_exit(void *arg, int v);
 
 /*
  * Console i/o
  */
 
 /*
  * Console output callback: emit the low byte of `ch` on descriptor 1.
  * The result of write() is not examined.
  */
 void
 test_putc(void *arg, int ch)
 {
 	const char byte = ch;
 
 	(void)write(1, &byte, sizeof(byte));
 }
 
 /*
  * Console input callback: read one byte from descriptor 0.
  * Returns the byte (converted from char to int), or -1 when read() did
  * not deliver exactly one byte.
  */
 int
 test_getc(void *arg)
 {
 	char byte;
 	ssize_t got;
 
 	got = read(0, &byte, 1);
 	if (got != 1)
 		return -1;
 	return byte;
 }
 
 /*
  * Console poll callback: ask descriptor 0 how many bytes are waiting
  * (FIONREAD).  Returns 1 when at least one byte is pending, 0 when none
  * is or when the ioctl itself fails.
  */
 int
 test_poll(void *arg)
 {
 	int pending;
 
 	if (ioctl(0, FIONREAD, &pending) < 0)
 		return (0);
 	return (pending > 0);
 }
 
 /*
  * Host filesystem i/o
  */
 
 /*
  * Handle returned by test_open() and consumed by the other host file
  * callbacks (close/isdir/read/readdir/seek/stat).
  */
 struct test_file {
 	int tf_isdir;		/* 1: tf_u.dir is valid; 0: tf_u.fd is valid */
 	size_t tf_size;		/* file size recorded by test_open() */
 	struct stat tf_stat;	/* stat(2) result taken at open time */
 	union {
 		int fd;		/* regular file descriptor */
 		DIR *dir;	/* directory stream */
 	} tf_u;
 };
 
 /*
  * Host file callback: open `filename`, taken relative to host_base, and
  * return an opaque handle through *h_return.
  *
  * Only directories and regular files can be opened.  Returns 0 on
  * success, ENOENT when no host_base was configured, ENAMETOOLONG when
  * the combined path does not fit in PATH_MAX, ENOMEM when the handle
  * cannot be allocated, the stat(2) errno when the path cannot be
  * examined, and EINVAL for any other file type or when the final
  * open()/opendir() fails.  The handle is released by test_close().
  */
 int
 test_open(void *arg, const char *filename, void **h_return)
 {
 	struct test_file *tf;
 	char path[PATH_MAX];
 	size_t len;
 	int error;
 
 	if (!host_base)
 		return (ENOENT);
 
 	/* Build "<host_base minus one trailing slash><filename>". */
 	if (strlcpy(path, host_base, sizeof(path)) >= sizeof(path))
 		return (ENAMETOOLONG);
 	len = strlen(path);
 	/* Guard len so an empty host_base cannot index path[-1]. */
 	if (len > 0 && path[len - 1] == '/')
 		path[len - 1] = 0;
 	if (strlcat(path, filename, sizeof(path)) >= sizeof(path))
 		return (ENAMETOOLONG);
 
 	tf = malloc(sizeof(*tf));
 	if (tf == NULL)
 		return (ENOMEM);
 	if (stat(path, &tf->tf_stat) < 0) {
 		/* Save errno first: free() is allowed to change it. */
 		error = errno;
 		free(tf);
 		return (error);
 	}
 
 	/* Take the size from the stat result that was actually filled in. */
 	tf->tf_size = tf->tf_stat.st_size;
 	if (S_ISDIR(tf->tf_stat.st_mode)) {
 		tf->tf_isdir = 1;
 		tf->tf_u.dir = opendir(path);
 		if (!tf->tf_u.dir)
 			goto out;
 		*h_return = tf;
 		return (0);
 	}
 	if (S_ISREG(tf->tf_stat.st_mode)) {
 		tf->tf_isdir = 0;
 		tf->tf_u.fd = open(path, O_RDONLY);
 		if (tf->tf_u.fd < 0)
 			goto out;
 		*h_return = tf;
 		return (0);
 	}
 
 out:
 	free(tf);
 	return (EINVAL);
 }
 
 /*
  * Host file callback: release a handle produced by test_open().
  * Closes the directory stream or the file descriptor, whichever the
  * handle holds, frees the handle and always returns 0.
  */
 int
 test_close(void *arg, void *h)
 {
 	struct test_file *file = h;
 
 	if (!file->tf_isdir)
 		close(file->tf_u.fd);
 	else
 		closedir(file->tf_u.dir);
 	free(file);
 
 	return (0);
 }
 
 /*
  * Host file callback: report the tf_isdir flag that test_open() stored
  * in the handle (non-zero for a directory).
  */
 int
 test_isdir(void *arg, void *h)
 {
 
 	return (((struct test_file *)h)->tf_isdir);
 }
 
 /*
  * Host file callback: read up to `size` bytes from a regular-file handle
  * into `dst`.  On success stores the number of bytes NOT read in
  * *resid_return and returns 0.  Returns EINVAL for directory handles
  * and when read() fails.
  */
 int
 test_read(void *arg, void *h, void *dst, size_t size, size_t *resid_return)
 {
 	struct test_file *file = h;
 	ssize_t nread;
 
 	if (!file->tf_isdir) {
 		nread = read(file->tf_u.fd, dst, size);
 		if (nread >= 0) {
 			*resid_return = size - nread;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 /*
  * Host file callback: fetch the next entry of a directory handle.
  * Copies the entry's file number, type and name length to the *_return
  * arguments and its NUL-terminated name into `name`.
  * Returns EINVAL when the handle is not a directory, ENOENT when
  * readdir() yields no entry, 0 otherwise.
  */
 int
 test_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
     size_t *namelen_return, char *name)
 {
 	struct test_file *file = h;
 	struct dirent *ent;
 	size_t namelen;
 
 	if (!file->tf_isdir)
 		return (EINVAL);
 
 	ent = readdir(file->tf_u.dir);
 	if (ent == NULL)
 		return (ENOENT);
 
 	/*
 	 * No length check is made before copying: per the original note,
 	 * d_namlen is within 0..255 and therefore below PATH_MAX.
 	 */
 	namelen = ent->d_namlen;
 	*fileno_return = ent->d_fileno;
 	*type_return = ent->d_type;
 	*namelen_return = namelen;
 	memcpy(name, ent->d_name, namelen);
 	name[namelen] = 0;
 
 	return (0);
 }
 
 /*
  * Host file callback: reposition a regular-file handle with lseek().
  * Returns EINVAL for directory handles, the lseek() errno on failure,
  * 0 on success.
  */
 int
 test_seek(void *arg, void *h, uint64_t offset, int whence)
 {
 	struct test_file *file = h;
 
 	if (file->tf_isdir)
 		return (EINVAL);
 	return (lseek(file->tf_u.fd, offset, whence) < 0 ? errno : 0);
 }
 
 /*
  * Host file callback: report mode, owner, group and size from the
  * stat(2) snapshot that test_open() stored in the handle.
  * Always returns 0.
  */
 int
 test_stat(void *arg, void *h, int *mode_return, int *uid_return, int *gid_return,
     uint64_t *size_return)
 {
 	const struct stat *sb = &((struct test_file *)h)->tf_stat;
 
 	*mode_return = sb->st_mode;
 	*uid_return = sb->st_uid;
 	*gid_return = sb->st_gid;
 	*size_return = sb->st_size;
 	return (0);
 }
 
 /*
  * Disk image i/o
  */
 
 /*
  * Disk callback: read `size` bytes at byte `offset` of the disk image.
  * Only unit 0 exists.  Stores the number of bytes NOT read in
  * *resid_return.  Returns EIO for any other unit or when no image is
  * open, the pread() errno on failure, 0 on success.
  */
 int
 test_diskread(void *arg, int unit, uint64_t offset, void *dst, size_t size,
     size_t *resid_return)
 {
 	ssize_t nread;
 
 	if (!(unit == 0 && disk_fd != -1))
 		return (EIO);
 
 	nread = pread(disk_fd, dst, size, offset);
 	if (nread < 0)
 		return (errno);
 
 	*resid_return = size - nread;
 	return (0);
 }
 
 /*
  * Disk callback: answer the two size queries the loader issues.
  *   DIOCGSECTORSIZE - stores 512 as a u_int in *data.
  *   DIOCGMEDIASIZE  - stores the image's st_size as an off_t in *data.
  * Returns EBADF for a unit other than 0 or when no image is open,
  * ENOTTY for any other command or when fstat() fails, 0 on success.
  */
 int
 test_diskioctl(void *arg, int unit, u_long cmd, void *data)
 {
 	struct stat sb;
 
 	if (unit != 0 || disk_fd == -1)
 		return (EBADF);
 
 	if (cmd == DIOCGSECTORSIZE) {
 		*(u_int *)data = 512;
 		return (0);
 	}
 	if (cmd == DIOCGMEDIASIZE) {
 		if (fstat(disk_fd, &sb) != 0)
 			return (ENOTTY);
 		*(off_t *)data = sb.st_size;
 		return (0);
 	}
 	return (ENOTTY);
 }
 
 /*
  * Guest virtual machine i/o
  *
  * Note: guest addresses are kernel virtual
  */
 
 /*
  * Guest memory callback: copy `size` bytes from host buffer `from` into
  * the guest image at guest address `to`.
  *
  * The address is masked to its low 31 bits before use as an offset into
  * `image`.  An offset beyond image_size yields EFAULT; a copy that would
  * run past the end of the image is silently shortened.  Returns 0
  * otherwise.
  */
 int
 test_copyin(void *arg, const void *from, uint64_t to, size_t size)
 {
 	const uint64_t off = to & 0x7fffffff;
 	size_t count = size;
 
 	if (off > image_size)
 		return (EFAULT);
 	if (off + count > image_size)
 		count = image_size - off;
 	memcpy(image + off, from, count);
 	return(0);
 }
 
 /*
  * Guest memory callback: copy `size` bytes from the guest image at guest
  * address `from` into host buffer `to`.
  *
  * The address is masked to its low 31 bits before use as an offset into
  * `image`.  An offset beyond image_size yields EFAULT; a copy that would
  * run past the end of the image is silently shortened.  Returns 0
  * otherwise.
  */
 int
 test_copyout(void *arg, uint64_t from, void *to, size_t size)
 {
 	const uint64_t off = from & 0x7fffffff;
 	size_t count = size;
 
 	if (off > image_size)
 		return (EFAULT);
 	if (off + count > image_size)
 		count = image_size - off;
 	memcpy(to, image + off, count);
 	return(0);
 }
 
 /*
  * Guest CPU callback: remember value `v` for register number `r` in the
  * regs[] table.  Register numbers outside 0..15 are ignored.
  */
 void
 test_setreg(void *arg, int r, uint64_t v)
 {
 
 	if (r >= 0 && r < 16)
 		regs[r] = v;
 }
 
 /* Guest CPU callback: MSR writes are accepted and discarded. */
 void
 test_setmsr(void *arg, int r, uint64_t v)
 {
 
 	(void)arg;
 	(void)r;
 	(void)v;
 }
 
 /* Guest CPU callback: control-register writes are accepted and discarded. */
 void
 test_setcr(void *arg, int r, uint64_t v)
 {
 
 	(void)arg;
 	(void)r;
 	(void)v;
 }
 
 /* Guest CPU callback: GDT updates are accepted and discarded. */
 void
 test_setgdt(void *arg, uint64_t v, size_t sz)
 {
 
 	(void)arg;
 	(void)v;
 	(void)sz;
 }
 
 /*
  * Guest CPU callback: the loader asks to start executing at `pc`.
  * The test harness only reports the address and then terminates through
  * test_exit() with status 0.
  */
 void
 test_exec(void *arg, uint64_t pc)
 {
 	/* The "0x" prefix announces hexadecimal, so print with PRIx64. */
 	printf("Execute at 0x%"PRIx64"\n", pc);
 	test_exit(arg, 0);
 }
 
 /*
  * Misc
  */
 
 /* Misc callback: suspend the calling process for `usec` microseconds via usleep(). */
 void
 test_delay(void *arg, int usec)
 {
 
 	(void)usleep(usec);
 }
 
 /*
  * Misc callback: put descriptor 0 back to the terminal settings saved
  * in oldterm, then terminate the process with status `v`.
  */
 void
 test_exit(void *arg, int v)
 {
 	const struct termios *saved = &oldterm;
 
 	tcsetattr(0, TCSAFLUSH, saved);
 	exit(v);
 }
 
 /*
  * Misc callback: describe guest memory to the loader.
  * Reports 128 MiB through *lowmem and 0 through *highmem.
  */
 void
 test_getmem(void *arg, uint64_t *lowmem, uint64_t *highmem)
 {
 	const uint64_t low_bytes = 128*1024*1024;
 	const uint64_t high_bytes = 0;
 
 	*lowmem = low_bytes;
 	*highmem = high_bytes;
 }
 
 /*
  * Misc callback: return the idx'th "name=value" environment string
  * offered to the loader, or NULL once the list is exhausted.
  *
  * Indexes outside the table (negative, or past the terminating NULL)
  * also return NULL instead of reading beyond the array.
  */
 const char *
 test_getenv(void *arg, int idx)
 {
 	static const char *vars[] = {
 		"foo=bar",
 		"bar=barbar",
 		NULL
 	};
 	const int nvars = (int)(sizeof(vars) / sizeof(vars[0]));
 
 	if (idx < 0 || idx >= nvars)
 		return (NULL);
 	return (vars[idx]);
 }
 
 /*
  * Callback table handed to the loader entry point by main().  Every slot
  * is filled with the matching test_* function defined above.
  */
 struct loader_callbacks cb = {
 	/* Console i/o */
 	.putc = test_putc,
 	.getc = test_getc,
 	.poll = test_poll,
 
 	/* Host filesystem i/o */
 	.open = test_open,
 	.close = test_close,
 	.isdir = test_isdir,
 	.read = test_read,
 	.readdir = test_readdir,
 	.seek = test_seek,
 	.stat = test_stat,
 
 	/* Disk image i/o */
 	.diskread = test_diskread,
 	.diskioctl = test_diskioctl,
 
 	/* Guest virtual machine i/o */
 	.copyin = test_copyin,
 	.copyout = test_copyout,
 	.setreg = test_setreg,
 	.setmsr = test_setmsr,
 	.setcr = test_setcr,
         .setgdt = test_setgdt,
 	.exec = test_exec,
 
 	/* Misc */
 	.delay = test_delay,
 	.exit = test_exit,
         .getmem = test_getmem,
 
 	.getenv = test_getenv,
 };
 
 /* Print the command-line synopsis and terminate with exit status 1. */
 void
 usage(void)
 {
 
 	printf("usage: [-b <userboot shared object>] [-d <disk image path>] [-h <host filesystem path>]\n");
 	exit(1);
 }
 
 /*
  * Test driver: load the userboot shared object, look up its
  * "loader_main" entry point and call it with the callback table `cb`.
  *
  * Options: -b <userboot shared object> (default /boot/userboot.so),
  *          -d <disk image path>, -h <host filesystem path>.
  */
 int
 main(int argc, char** argv)
 {
 	void *h;
-	void (*func)(struct loader_callbacks *, void *, int, int);
+	void (*func)(struct loader_callbacks *, void *, int, int) __dead2;
 	int opt;
 	char *disk_image = NULL;
 	const char *userboot_obj = "/boot/userboot.so";
 
 	while ((opt = getopt(argc, argv, "b:d:h:")) != -1) {
 		switch (opt) {
 		case 'b':
 			userboot_obj = optarg;
 			break;
 
 		case 'd':
 			disk_image = optarg;
 			break;
 
 		case 'h':
 			host_base = optarg;
 			break;
 
 		case '?':
 			usage();
 		}
 	}
 
 	/*
 	 * NOTE(review): the dlopen() mode carries only RTLD_LOCAL; POSIX
 	 * expects RTLD_LAZY or RTLD_NOW to be included - confirm.
 	 */
 	h = dlopen(userboot_obj, RTLD_LOCAL);
 	if (!h) {
 		printf("%s\n", dlerror());
 		return (1);
 	}
 	func = dlsym(h, "loader_main");
 	if (!func) {
 		printf("%s\n", dlerror());
 		return (1);
 	}
 
 	/*
 	 * Backing store for test_copyin()/test_copyout(); the size matches
 	 * what test_getmem() reports as lowmem.
 	 * NOTE(review): the malloc() result is not checked.
 	 */
 	image_size = 128*1024*1024;
 	image = malloc(image_size);
 	if (disk_image) {
 		disk_fd = open(disk_image, O_RDONLY);
 		if (disk_fd < 0)
 			err(1, "Can't open disk image '%s'", disk_image);
 	}
 
 	/*
 	 * Save the current terminal settings in oldterm (restored by
 	 * test_exit()), then turn off ICRNL, ICANON and ECHO on stdin.
 	 */
 	tcgetattr(0, &term);
 	oldterm = term;
 	term.c_iflag &= ~(ICRNL);
 	term.c_lflag &= ~(ICANON|ECHO);
 	tcsetattr(0, TCSAFLUSH, &term);
 
 	/* The last argument is non-zero when a disk image was opened. */
 	func(&cb, NULL, USERBOOT_VERSION_3, disk_fd >= 0);
 }
Index: projects/bsd_rdma_4_9/sys/amd64/amd64/machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/amd64/amd64/machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/amd64/amd64/machdep.c	(revision 326162)
@@ -1,2570 +1,2569 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/efi.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <x86/isa/icu.h>
 #else
 #include <x86/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 #include <x86/init.h>
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 /*
  * Checks applied by sys_sigreturn() to a user-supplied context:
  * CS_SECURE  - ISPL(cs) must equal SEL_UPL.
  * EFL_SECURE - ef and oef may differ only in PSL_USERCHANGE bits.
  */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 static void cpu_startup(void *);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 /* Registers cpu_startup() with SYSINIT at SI_SUB_CPU / SI_ORDER_FIRST. */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* Preload data parse function */
 static caddr_t native_parse_preload_data(u_int64_t);
 
 /* Native function to fetch and parse the e820 map */
 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
 
 /* Default init_ops implementation. */
 struct init_ops init_ops = {
 	.parse_preload_data =	native_parse_preload_data,
 	.early_clock_source_init =	i8254_init,
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 #ifdef SMP
 	.mp_bootaddress =		mp_bootaddress,
 	.start_all_aps =		native_start_all_aps,
 #endif
 	.msi_init =			msi_init,
 };
 
 /*
  * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its value is
  * the physical address at which the kernel is loaded.
  */
 extern char kernphys[];
 
 /* NOTE(review): presumably the kernel message buffer; not used in the code visible here. */
 struct msgbuf *msgbufp;
 
 /*
  * Physical address of the EFI System Table. Stashed from the metadata hints
  * passed into the kernel and used by the EFI code to call runtime services.
  */
 vm_paddr_t efi_systbl_phys;
 
 /*
  * Intel ICH registers.  ICH_SMI_EN is the I/O port that cpu_startup()
  * reads and writes to clear the LEGACY_USB_EN bit.  The expansion is
  * parenthesised so the macro stays a single operand inside any larger
  * expression.
  */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
 
 /*
  * User segment selector values.  sendsig() and exec_setregs() load
  * _ucodesel, _udatasel, _ufssel and _ugssel into the trapframe.
  */
 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
 
 int cold = 1;
 
 /*
  * Maxmem is converted with ptoa() by cpu_startup() as the fallback for
  * the "real memory" figure; realmem is set there to atop() of the
  * figure that was printed.
  */
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 /*
  * Arrays of (start, end) address pairs; cpu_startup() walks phys_avail
  * two entries at a time until it meets an end value of 0.
  */
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
 #define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
 
 /* Passed to vm_ksubmap_init() by cpu_startup(). */
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct region_descriptor r_gdt, r_idt;
 
 struct pcpu __pcpu[MAXCPU];
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 void (*vmm_resume_p)(void);
 
 /*
  * SYSINIT hook (registered at SI_SUB_CPU): late CPU/memory start-up.
  *
  * In order: applies the MacBook SMI workaround, starts the RTC and
  * prints CPU information, prints the "real memory" figure and records
  * it in realmem, lists the phys_avail chunks when booting verbosely,
  * initialises the kernel submaps, prints the "avail memory" figure,
  * sets up the buffer cache and pager buffers, and finally programs CR0
  * through cpu_setregs().
  *
  * `dummy` is the unused SYSINIT argument.
  */
 static void
 cpu_startup(void *dummy)
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge.
 	 */
 	sysenv = kern_getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 * The SMBIOS value is shifted left by 10 before use; when it is
 	 * missing or smaller than the free page total, fall back to
 	 * Maxmem.
 	 */
 	memsize = 0;
 	sysenv = kern_getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		/* phys_avail holds (start, end) pairs; an end of 0 stops the walk. */
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)vm_cnt.v_free_count),
 	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by call
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  *
  * catcher is the user handler, ksi carries the signal number and
  * siginfo, mask is the signal mask stored in the saved context.
  * Entered with the process lock and p_sigacts->ps_mtx held (both are
  * asserted); both are dropped around the copyout and re-taken before
  * returning.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct pcb *pcb;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	pcb = td->td_pcb;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	/* Remember whether the thread was already on the alternate stack. */
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/*
 	 * When xsave is in use and the extended state is larger than
 	 * struct savefpu, the excess is staged in a kernel-stack buffer.
 	 */
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	/* The trapframe is copied wholesale into the mcontext starting at mc_rdi. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	update_pcb_bases(pcb);
 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Start from the top of the alternate signal stack. */
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* Stay on the current stack, 128 bytes below %rsp (presumably to skip the ABI red zone). */
 		sp = (char *)regs->tf_rsp - 128;
 	if (xfpusave != NULL) {
 		/* Carve out the extended FPU area, rounded down to a 64-byte boundary. */
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Both locks are released before touching user memory. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		/* NOTE(review): presumably sigexit() does not return - confirm. */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the signal trampoline with the new frame as its stack. */
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_ss = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	/* Re-take the locks the caller holds, in the order they were asserted. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct pcb *pcb;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	pcb = td->td_pcb;
 	p = td->td_proc;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0) {
 		uprintf("pid %d (%s): sigreturn copyin failed\n",
 		    p->p_pid, td->td_name);
 		return (error);
 	}
 	ucp = &uc;
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
 		    td->td_name, rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
 		    td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    p->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
 		xfpustate = __builtin_alloca(xfpustate_len);
 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    p->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    p->p_pid, td->td_name, ret);
 		return (ret);
 	}
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: forwards to sys_sigreturn() with
  * the argument structure reinterpreted as struct sigreturn_args.
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args = (struct sigreturn_args *)uap;
 
 	return (sys_sigreturn(td, args));
 }
 #endif
 
 /*
  * Reset registers to default values on exec.
  *
  * td is the thread performing the exec, imgp supplies the new entry
  * point (imgp->entry_addr) and stack is the initial user stack address,
  * which is also passed to the new image in %rdi.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	/* Release a per-process LDT left over from the previous image. */
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	clear_pcb_flags(pcb, PCB_32BIT);
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	/* Yields an %rsp that is 8 modulo 16. */
 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	regs->tf_rdi = stack;		/* argv */
 	/*
 	 * NOTE(review): the frame was zeroed just above, so
 	 * (regs->tf_rflags & PSL_T) is 0 at this point.
 	 */
 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
-	td->td_retval[1] = 0;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		clear_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 }
 
 /*
  * Turn on the CR0 bits the kernel runs with (MP, NE, TS, WP, AM) on the
  * current CPU, leaving every other CR0 bit as it was.
  */
 void
 cpu_setregs(void)
 {
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
 	 * BSP.  See the comments there about why we set them.
 	 */
 	const register_t wanted = CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	register_t current;
 
 	current = rcr0();
 	load_cr0(current | wanted);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 /* Room for NGDT descriptors for each of MAXCPU CPUs. */
 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
 static struct gate_descriptor idt0[NIDT];
 /* setidt() and the ddb "show idt" command go through this pointer rather than idt0 directly. */
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 /* NOTE(review): presumably the dedicated stack for the double-fault handler - confirm at its use site. */
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 
 /* NOTE(review): presumably the NMI stack for CPU 0 - confirm at its use site. */
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 /* One TSS per possible CPU. */
 struct amd64tss common_tss[MAXCPU];
 
 /*
  * Software prototypes -- in more palatable form.
  *
  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  * slots as corresponding segments for i386 kernel.
  *
  * Each entry below is labelled with its selector name and slot number.
  * System descriptors (TSS, LDT) take two consecutive slots, so their
  * second slot is an all-zero placeholder.
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GNULL2_SEL	1 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
 	.ssd_type = SDT_SYSTSS,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* Actually, the TSS is a system descriptor which is double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 LDT Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 
 /*
  * Fill in interrupt descriptor table slot `idx`.
  *
  * func is the handler entry point, typ the gate type, dpl the
  * descriptor privilege level and ist the interrupt stack table index.
  * The gate always uses the kernel code selector and is marked present.
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
 {
 	struct gate_descriptor *gate = &idt[idx];
 	const uintptr_t entry = (uintptr_t)func;
 
 	/* The handler address is split: low part first, high part last. */
 	gate->gd_looffset = entry;
 	gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	gate->gd_ist = ist;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 	gate->gd_hioffset = entry >> 16;
 }
 
 /*
  * Handler entry points named through IDTVEC(); they are declared here
  * and defined outside this file.  IDTVEC(rsvd) is the default entry
  * that the ddb "show idt" command filters out.
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall),
 #endif
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 
 #ifdef DDB
 /*
  * "show idt": walk the IDT and print the vector number and handler
  * symbol of every gate whose handler is not the default 'rsvd' entry
  * point.  The walk stops early once the pager has been quit.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *gate;
 	uintptr_t handler;
 	int vec;
 
 	for (vec = 0, gate = idt; vec < NIDT && !db_pager_quit;
 	    vec++, gate++) {
 		/* Reassemble the handler address from its two halves. */
 		handler = ((long)gate->gd_hioffset << 16 | gate->gd_looffset);
 		if (handler == (uintptr_t)&IDTVEC(rsvd))
 			continue;
 		db_printf("%3d\t", vec);
 		db_printsym(handler, DB_STGY_PROC);
 		db_printf("\n");
 	}
 }
 
 /*
  * "show sysregs": print the descriptor table registers, the task and
  * LDT selectors, the control registers and a handful of MSRs.
  *
  * xcr0 is only read when CR4 has XSAVE enabled, and the feature control
  * MSR only when the CPU advertises VMX or SMX.  Every value is printed
  * as zero-padded hex with a leading "0x".
  */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	/* In-memory layout stored by the sidt/sgdt instructions. */
 	struct {
 		uint16_t limit;
 		uint64_t base;
 	} __packed idtr, gdtr;
 	uint16_t ldt, tr;
 
 	__asm __volatile("sidt %0" : "=m" (idtr));
 	db_printf("idtr\t0x%016lx/%04x\n",
 	    (u_long)idtr.base, (u_int)idtr.limit);
 	__asm __volatile("sgdt %0" : "=m" (gdtr));
 	db_printf("gdtr\t0x%016lx/%04x\n",
 	    (u_long)gdtr.base, (u_int)gdtr.limit);
 	__asm __volatile("sldt %0" : "=r" (ldt));
 	db_printf("ldtr\t0x%04x\n", ldt);
 	__asm __volatile("str %0" : "=r" (tr));
 	db_printf("tr\t0x%04x\n", tr);
 	db_printf("cr0\t0x%016lx\n", rcr0());
 	db_printf("cr2\t0x%016lx\n", rcr2());
 	db_printf("cr3\t0x%016lx\n", rcr3());
 	db_printf("cr4\t0x%016lx\n", rcr4());
 	if (rcr4() & CR4_XSAVE)
 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
 		/* This line used to lack the "0x" the other lines carry. */
 		db_printf("FEATURES_CTL\t0x%016lx\n",
 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
 }
 
 /* "show dbregs": print debug registers dr0-dr3, dr6 and dr7. */
 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 {
 /* Expands to the same format string and rdrN() call as written by hand. */
 #define	DB_SHOW_DR(n)	db_printf("dr" #n "\t0x%016lx\n", rdr##n())
 	DB_SHOW_DR(0);
 	DB_SHOW_DR(1);
 	DB_SHOW_DR(2);
 	DB_SHOW_DR(3);
 	DB_SHOW_DR(6);
 	DB_SHOW_DR(7);
 #undef DB_SHOW_DR
 }
 #endif
 
 void
 sdtossd(sd, ssd)
 	struct user_segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Pack a software segment descriptor into a user segment descriptor.
  *
  * ssd: software descriptor to encode; it is only read.
  * sd:  receives the base split into a 24-bit low part and an 8-bit high
  *      part, the limit split into a 16-bit low part and a 4-bit high
  *      part, and copies of the type, dpl, present, long, def32 and
  *      granularity fields.
  */
 void
 ssdtosd(struct soft_segment_descriptor *ssd,
     struct user_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_long  = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 /*
  * Pack a software segment descriptor into a system segment descriptor.
  *
  * ssd: software descriptor to encode; it is only read.
  * sd:  receives the base split into a 24-bit low part and a 40-bit high
  *      part, the limit split into a 16-bit low part and a 4-bit high
  *      part, and copies of the type, dpl, present and granularity
  *      fields.  The long and def32 fields are not copied.
  */
 void
 ssdtosyssd(struct soft_segment_descriptor *ssd,
     struct system_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
 #include <isa/isavar.h>
 #include <isa/isareg.h>
 /*
  * Return a bitmap of the interrupt requests that are currently pending.
  * This is 8259-specific and only suitable for use at probe time.
  *
  * It exists solely to pacify sio and it is NOT FATAL if it does not work.
  * It does not really belong here; if it is needed at all, an APIC
  * centric implementation should probably live in the apic driver code.
  *
  * The byte read from IO_ICU1 forms bits 0-7 of the result and the byte
  * read from IO_ICU2 forms bits 8-15.
  */
 intrmask_t
 isa_irq_pending(void)
 {
 	u_char master, slave;
 
 	master = inb(IO_ICU1);
 	slave = inb(IO_ICU2);
 	return ((slave << 8) | master);
 }
 #endif
 
 u_int basemem;
 
 /*
  * Add the range [base, base + length) to 'physmap', an array of
  * (start, end) pairs kept sorted by address.
  *
  * '*physmap_idxp' is the index of the next free slot.  It is advanced by
  * two only when a new pair is actually stored; merging into a neighbour
  * or ignoring the range leaves it untouched.
  *
  * Zero-length ranges are ignored, as are ranges that overlap an existing
  * entry (with a message when booting verbosely).
  *
  * Returns 1 if the caller may keep adding entries and 0 once the array
  * cannot take another pair.  In the latter case neither the array nor
  * '*physmap_idxp' is modified, so the index never covers a pair that was
  * not filled in.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 *
 	 * NB: physmap_idx points to the next free slot.
 	 */
 	insert_idx = physmap_idx;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * The scan above always reads the free pair at physmap_idx, so one
 	 * pair has to stay unused.  Refuse the entry before touching the
 	 * caller's index; publishing the advanced index without storing a
 	 * pair would make the caller count an empty (0, 0) segment.
 	 */
 	if (physmap_idx + 2 == PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 /*
  * Walk the 'smapsize' bytes of BIOS SMAP records starting at 'smapbase'
  * and feed every record of type SMAP_TYPE_MEMORY to add_physmap_entry().
  * Each record is printed first when booting verbosely.  The walk stops
  * as soon as add_physmap_entry() reports that no more entries fit.
  */
 void
 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
     vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct bios_smap *cur, *limit;
 
 	limit = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 
 	cur = smapbase;
 	while (cur < limit) {
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    cur->type, cur->base, cur->length);
 
 		if (cur->type == SMAP_TYPE_MEMORY &&
 		    !add_physmap_entry(cur->base, cur->length, physmap,
 		    physmap_idx))
 			break;
 		cur++;
 	}
 }
 
 /*
  * Walk the UEFI memory map that follows 'efihdr' and hand every
  * descriptor whose type the kernel may use to add_physmap_entry().
  * When booting verbosely each descriptor is printed together with its
  * attribute flags.  A map with a zero descriptor size is ignored, and
  * the walk stops as soon as add_physmap_entry() reports that no more
  * entries fit.
  */
 static void
 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
     int *physmap_idx)
 {
 	/* Printable names, indexed by descriptor type. */
 	static const char *type_names[] = {
 		"Reserved",
 		"LoaderCode",
 		"LoaderData",
 		"BootServicesCode",
 		"BootServicesData",
 		"RuntimeServicesCode",
 		"RuntimeServicesData",
 		"ConventionalMemory",
 		"UnusableMemory",
 		"ACPIReclaimMemory",
 		"ACPIMemoryNVS",
 		"MemoryMappedIO",
 		"MemoryMappedIOPortSpace",
 		"PalCode",
 		"PersistentMemory"
 	};
 	/* Attribute flags in the order in which they are printed. */
 	static const struct {
 		uint64_t	 bit;
 		const char	*text;
 	} attr_names[] = {
 		{ EFI_MD_ATTR_UC, "UC " },
 		{ EFI_MD_ATTR_WC, "WC " },
 		{ EFI_MD_ATTR_WT, "WT " },
 		{ EFI_MD_ATTR_WB, "WB " },
 		{ EFI_MD_ATTR_UCE, "UCE " },
 		{ EFI_MD_ATTR_WP, "WP " },
 		{ EFI_MD_ATTR_RP, "RP " },
 		{ EFI_MD_ATTR_XP, "XP " },
 		{ EFI_MD_ATTR_NV, "NV " },
 		{ EFI_MD_ATTR_MORE_RELIABLE, "MORE_RELIABLE " },
 		{ EFI_MD_ATTR_RO, "RO " },
 		{ EFI_MD_ATTR_RT, "RUNTIME" },
 	};
 	struct efi_md *first, *md;
 	const char *type;
 	size_t hdrsz;
 	u_int a;
 	int n, ndesc, usable;
 
 	/*
 	 * Memory map data provided by UEFI via the GetMemoryMap
 	 * Boot Services API.  The descriptors start after the header,
 	 * whose size is rounded up to a multiple of 16 bytes.
 	 */
 	hdrsz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 	first = (struct efi_md *)((uint8_t *)efihdr + hdrsz);
 
 	if (efihdr->descriptor_size == 0)
 		return;
 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
 
 	if (boothowto & RB_VERBOSE)
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	for (n = 0, md = first; n < ndesc; n++,
 	    md = efi_next_descriptor(md, efihdr->descriptor_size)) {
 		if (boothowto & RB_VERBOSE) {
 			type = "<INVALID>";
 			if (md->md_type < nitems(type_names))
 				type = type_names[md->md_type];
 			printf("%23s %012lx %12p %08lx ", type, md->md_phys,
 			    md->md_virt, md->md_pages);
 			for (a = 0; a < nitems(attr_names); a++) {
 				if (md->md_attr & attr_names[a].bit)
 					printf("%s", attr_names[a].text);
 			}
 			printf("\n");
 		}
 
 		/* We're allowed to use any entry with these types. */
 		usable = md->md_type == EFI_MD_TYPE_CODE ||
 		    md->md_type == EFI_MD_TYPE_DATA ||
 		    md->md_type == EFI_MD_TYPE_BS_CODE ||
 		    md->md_type == EFI_MD_TYPE_BS_DATA ||
 		    md->md_type == EFI_MD_TYPE_FREE;
 		if (!usable)
 			continue;
 
 		if (!add_physmap_entry(md->md_phys, (md->md_pages * PAGE_SIZE),
 		    physmap, physmap_idx))
 			break;
 	}
 }
 
 /*
  * Name of the firmware interface the memory map came from.  It starts
  * out empty, is set to "UEFI" or "BIOS" by native_parse_memmap(), and is
  * exported read-only as the machdep.bootmethod sysctl.
  */
 static char bootmethod[16] = "";
 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
     "System firmware boot method");
 
 /*
  * Fill 'physmap' from the memory map the loader attached to the kernel
  * metadata 'kmdp'.  An EFI map is preferred when present; otherwise the
  * BIOS INT 15:E820 SMAP is used.  'bootmethod' records which one was
  * consumed.  Panics if the loader supplied neither.
  */
 static void
 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct efi_map_header *efi;
 	struct bios_smap *e820;
 
 	efi = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	e820 = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (efi == NULL && e820 == NULL)
 		panic("No BIOS smap or EFI map info from loader!");
 
 	if (efi != NULL) {
 		add_efi_map_entries(efi, physmap, physmap_idx);
 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 		return;
 	}
 
 	/*
 	 * Memory map from INT 15:E820.
 	 *
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t immediately precedes smap.
 	 */
 	bios_add_smap_entries(e820, *((u_int32_t *)e820 - 1), physmap,
 	    physmap_idx);
 	strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 }
 
 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * kmdp:  kernel metadata handed to the memory map parser.
  * first: first physical address past the kernel; pages from kernphys up
  *        to (but excluding) this address are kept out of phys_avail.
  *        pmap_bootstrap() receives a pointer to it and may update it.
  *
  * Besides phys_avail and dump_avail this sets basemem, Maxmem, physmem
  * and msgbufp.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, physmap_idx, pa_indx, da_indx;
 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 	u_long physmem_start, physmem_tunable, memtest;
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 	int page_counter;
 
 	bzero(physmap, sizeof(physmap));
 	physmap_idx = 0;
 
 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
 	/*
 	 * From here on physmap_idx is used as the index of the last pair
 	 * rather than of the next free slot.
 	 */
 	physmap_idx -= 2;
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] <= 0xA0000) {
 			/* basemem is kept in kilobytes. */
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0 || basemem > 640) {
 		if (bootverbose)
 			printf(
 		"Memory map doesn't contain a basemem segment, faking it");
 		basemem = 640;
 	}
 
 	/*
 	 * Make hole for "AP -> long mode" bootstrap code.  The
 	 * mp_bootaddress vector is only available when the kernel
 	 * is configured to support APs and APs for the system start
 	 * in 32bit mode (e.g. SMP bare metal).
 	 */
 	if (init_ops.mp_bootaddress) {
 		if (physmap[1] >= 0x100000000)
 			panic(
 	"Basemem segment is not suitable for AP bootstrap code!");
 		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
 	}
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * The boot memory test is disabled by default, as it takes a
 	 * significant amount of time on large-memory systems, and is
 	 * unfriendly to virtual machines as it unnecessarily touches all
 	 * pages.
 	 *
 	 * A general name is used as the code may be extended to support
 	 * additional tests beyond the current "page present" test.
 	 */
 	memtest = 0;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	/* NOTE(review): "Maxmem * 4" presumably assumes 4K pages - confirm. */
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 	 * By default, mask off the first 16 pages unless we appear to be
 	 * running in a VM.
 	 */
 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 	if (physmap[0] < physmem_start) {
 		/*
 		 * Never start below the second page, and keep at least
 		 * the last page of the first segment.
 		 */
 		if (physmem_start < PAGE_SIZE)
 			physmap[0] = PAGE_SIZE;
 		else if (physmem_start >= physmap[1])
 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 		else
 			physmap[0] = round_page(physmem_start);
 	}
 	/*
 	 * Seed both arrays with an empty range at the start of the first
 	 * segment; pa_indx and da_indx index the end of the current range.
 	 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	/* Page table entry used to map each page under test at CADDR1. */
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	page_counter = 0;
 	if (memtest != 0)
 		printf("Testing system memory");
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		/* Clip the segment at Maxmem. */
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= (vm_paddr_t)kernphys && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * Print a "." every GB to show we're making
 			 * progress.
 			 */
 			page_counter++;
 			if ((page_counter % PAGES_PER_GB) == 0)
 				printf(".");
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			/* Save the first word so it can be restored below. */
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.  A bad page is
 			 * left out of both phys_avail and dump_avail.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					/*
 					 * Still record the page for dumps,
 					 * then leave this segment's loop.
 					 */
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/* Same extend-or-start-new-range logic for dump_avail. */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 	if (memtest != 0)
 		printf("\n");
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).  Chunks that are too small are dropped and
 	 * their pages subtracted from physmem.
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 }
 
 /*
  * Locate the loader-provided metadata at 'modulep' (offset by KERNBASE),
  * relocate it, and pull out the boot flags, the static kernel
  * environment, the kernel symbol table bounds (DDB only) and the EFI
  * system table address.  Returns the kernel's metadata handle.
  */
 static caddr_t
 native_parse_preload_data(u_int64_t modulep)
 {
 	caddr_t kernel_md;
 	char *kenv;
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 
 	/* The kernel may be registered under either type name. */
 	kernel_md = preload_search_by_type("elf kernel");
 	if (kernel_md == NULL)
 		kernel_md = preload_search_by_type("elf64 kernel");
 
 	boothowto = MD_FETCH(kernel_md, MODINFOMD_HOWTO, int);
 
 	/* A missing environment is passed on as NULL, not as KERNBASE. */
 	kenv = MD_FETCH(kernel_md, MODINFOMD_ENVP, char *);
 	if (kenv != NULL)
 		kenv += KERNBASE;
 	init_static_kenv(kenv, 0);
 #ifdef DDB
 	{
 		vm_offset_t symtab_lo, symtab_hi;
 
 		symtab_lo = MD_FETCH(kernel_md, MODINFOMD_SSYM, uintptr_t);
 		symtab_hi = MD_FETCH(kernel_md, MODINFOMD_ESYM, uintptr_t);
 		db_fetch_ksymtab(symtab_lo, symtab_hi);
 	}
 #endif
 	efi_systbl_phys = MD_FETCH(kernel_md, MODINFOMD_FW_HANDLE,
 	    vm_paddr_t);
 
 	return (kernel_md);
 }
 
 /*
  * Initialize the kernel debugger framework and, when KDB is compiled in
  * and the boot flags contain RB_KDB, enter the debugger immediately.
  */
 static void
 amd64_kdb_init(void)
 {
 
 	kdb_init();
 #ifdef KDB
 	if ((boothowto & RB_KDB) != 0) {
 		kdb_enter(KDB_WHY_BOOTFLAGS,
 		    "Boot flags requested debugger");
 	}
 #endif
 }
 
 /*
  * Early machine-dependent initialization of the boot processor.
  *
  * modulep:  passed to init_ops.parse_preload_data() to locate the
  *           loader metadata.
  * physfree: physical address of the first free memory.  thread0's kernel
  *           stack and the dynamic per-CPU area are carved from it before
  *           the remainder is handed to getmemsize().
  *
  * In order, this sets up thread0 and its stack, the GDT, the per-CPU
  * data, the early mutexes, the IDT, the clock, the TSS with its
  * interrupt stacks, the fast syscall MSRs, physical memory (getmemsize),
  * the console and debugger, the message buffer, the FPU and thread0's
  * PCB.
  *
  * Returns the address of thread0's PCB, which locore uses as the
  * location of the kernel stack.
  */
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct xstate_hdr *xhdr;
 	u_int64_t msr;
 	char *env;
 	size_t kstack0_sz;
 	int late_console;
 
 	/*
 	 * This may be done better later if it gets more high level
 	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	identify_cpu();
 	identify_hypervisor();
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/* Carve thread0's kernel stack out of the first free memory. */
 	thread0.td_kstack = physfree + KERNBASE;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
 	physfree += kstack0_sz;
 
 	/*
 	 * make gdt memory segments
 	 *
 	 * The TSS and LDT slots each span two GDT entries and are not
 	 * user segment descriptors, so they are skipped here.
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (long) gdt;
 	lgdt(&r_gdt);
 	pc = &__pcpu[0];
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	/* The dynamic per-CPU area is also taken from the free memory. */
 	dpcpu_init((void *)(physfree + KERNBASE), 0);
 	physfree += DPCPU_SIZE;
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	/* Non-late cninit() and printf() can be moved up to here. */
 	PCPU_SET(tssp, &common_tss[0]);
 	PCPU_SET(commontssp, &common_tss[0]);
 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/*
 	 * exceptions
 	 *
 	 * Every vector first points at the 'rsvd' handler; the known
 	 * exceptions are then overridden one by one.  The last setidt()
 	 * argument is the interrupt stack table index: NMI uses ist2 and
 	 * the double fault handler ist1, both set up further down.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the clock before the console so that console
 	 * initialization can use DELAY().
 	 */
 	clock_init();
 
 	/*
 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
 	 * transition).
 	 * Once bootblocks have updated, we can test directly for
 	 * efi_systbl != NULL here...
 	 */
 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
 	    != NULL)
 		vty_set_preferred(VTY_VT);
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 	initializecpucache();
 
 	/* doublefault stack space, runs on ist1 */
 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
 	/*
 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
 	 * above the start of the ist2 stack.
 	 */
 	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist2 = (long) np;
 
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	/* Set up the fast syscall stuff */
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
 	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	/* STAR: kernel code selector at bit 32, user 32-bit one at bit 48. */
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, msr);
 	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
 
 	/*
 	 * Temporary forge some valid pointer to PCB, for exception
 	 * handlers.  It is reinitialized properly below after FPU is
 	 * set up.  Also set up td_critnest to short-cut the page
 	 * fault handler.
 	 */
 	cpu_max_ext_state_size = sizeof(struct savefpu);
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_critnest = 1;
 
 	/*
 	 * The console and kdb should be initialized even earlier than here,
 	 * but some console drivers don't work until after getmemsize().
 	 * Default to late console initialization to support these drivers.
 	 * This loses mainly printf()s in getmemsize() and early debugging.
 	 */
 	late_console = 1;
 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
 	if (!late_console) {
 		cninit();
 		amd64_kdb_init();
 	}
 
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	if (late_console)
 		cninit();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?";
 #endif
 
 	/* In the late-console case the debugger comes up after the ICU setup. */
 	if (late_console)
 		amd64_kdb_init();
 
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
 	/*
 	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
 	 * area size.  Zero out the extended state header in fpu save
 	 * area.
 	 */
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 	if (use_xsave) {
 		/* The xstate header sits right after the savefpu area. */
 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 		    1);
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
 	/* Ensure the stack is aligned to 16 bytes */
 	common_tss[0].tss_rsp0 &= ~0xFul;
 	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 
         env = kern_getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	cpu_probe_amdc1e();
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 	/* Early setup is done; undo the td_critnest short-cut from above. */
 	thread0.td_critnest = 0;
 
 	/* Location of kernel stack for locore */
 	return ((u_int64_t)thread0.td_pcb);
 }
 
 /*
  * Machine-dependent per-CPU initialization: mark the ACPI id of 'pcpu'
  * with the all-ones value.  'cpuid' and 'size' are unused.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Sysctl handler that copies out the raw BIOS SMAP as a sequence of
  * struct bios_smap_xattr records, one per SMAP entry.  The extended
  * attribute of a record is taken from the loader's SMAP_XATTR metadata
  * when that is present and is 0 otherwise.
  *
  * Returns 0 when the loader supplied no SMAP or when every record was
  * copied out.  Otherwise returns the error of the first failed
  * SYSCTL_OUT(); no further records are copied after a failure.
  */
 static int
 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct bios_smap *smapbase;
 	struct bios_smap_xattr smap;
 	caddr_t kmdp;
 	uint32_t *smapattr;
 	int count, error, i;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		return (0);
 	smapattr = (uint32_t *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 	/* The 32-bit word before the data holds its size in bytes. */
 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 	error = 0;
 	for (i = 0; i < count; i++) {
 		smap.base = smapbase[i].base;
 		smap.length = smapbase[i].length;
 		smap.type = smapbase[i].type;
 		if (smapattr != NULL)
 			smap.xattr = smapattr[i];
 		else
 			smap.xattr = 0;
 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
 		/* Stop here so a later success cannot mask the failure. */
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 
 /*
  * Sysctl handler that copies out the raw EFI memory map exactly as the
  * loader stored it.  Returns 0 without output when the loader supplied
  * no EFI map, otherwise the result of SYSCTL_OUT().
  */
 static int
 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct efi_map_header *hdr;
 	caddr_t kernel_md;
 
 	kernel_md = preload_search_by_type("elf kernel");
 	if (kernel_md == NULL)
 		kernel_md = preload_search_by_type("elf64 kernel");
 	hdr = (struct efi_map_header *)preload_search_info(kernel_md,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	if (hdr == NULL)
 		return (0);
 
 	/* The 32-bit word before the data holds its size in bytes. */
 	return (SYSCTL_OUT(req, hdr, *((uint32_t *)hdr - 1)));
 }
 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 
 /*
  * Enter a spin lock section for the current thread.  The outermost entry
  * disables interrupts, remembers the previous interrupt flags in the
  * thread and enters a critical section; nested entries only bump the
  * nesting count.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self = curthread;
 	register_t saved;
 
 	if (self->td_md.md_spinlock_count != 0) {
 		self->td_md.md_spinlock_count++;
 		return;
 	}
 
 	saved = intr_disable();
 	self->td_md.md_spinlock_count = 1;
 	self->td_md.md_saved_flags = saved;
 	critical_enter();
 }
 
 /*
  * Leave a spin lock section for the current thread.  When the nesting
  * count drops to zero the critical section is exited and the interrupt
  * flags saved by the outermost spinlock_enter() are restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self = curthread;
 	register_t saved;
 
 	/* Fetch the saved flags before touching the nesting count. */
 	saved = self->td_md.md_saved_flags;
 	self->td_md.md_spinlock_count--;
 	if (self->td_md.md_spinlock_count != 0)
 		return;
 
 	critical_exit();
 	intr_restore(saved);
 }
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() calls this so that a
  * backtrace can start from the function that caused the debugger to be
  * entered: the context lives in the trapframe, but traces are based on
  * the PCB.  The PCB need not be perfect, it only has to contain enough
  * for a backtrace.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Where execution was and where its stack is. */
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rsp = tf->tf_rsp;
 	pcb->pcb_rbp = tf->tf_rbp;
 
 	/* The remaining registers tracked by the PCB. */
 	pcb->pcb_rbx = tf->tf_rbx;
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 }
 
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 
 	td->td_frame->tf_rip = addr;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_rflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_rflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Copy the register state saved in the trapframe of 'td' into 'regs'.
  * Returns whatever fill_frame_regs() returns.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Copy the registers held in trapframe 'tp' into 'regs'.  The ds, es, fs
  * and gs values are taken from the frame only when it is flagged
  * TF_HASSEGS; otherwise they are reported as 0.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 	int has_segs = (tp->tf_flags & TF_HASSEGS) != 0;
 
 	/* General purpose registers. */
 	regs->r_rax = tp->tf_rax;
 	regs->r_rbx = tp->tf_rbx;
 	regs->r_rcx = tp->tf_rcx;
 	regs->r_rdx = tp->tf_rdx;
 	regs->r_rsi = tp->tf_rsi;
 	regs->r_rdi = tp->tf_rdi;
 	regs->r_rbp = tp->tf_rbp;
 	regs->r_r8  = tp->tf_r8;
 	regs->r_r9  = tp->tf_r9;
 	regs->r_r10 = tp->tf_r10;
 	regs->r_r11 = tp->tf_r11;
 	regs->r_r12 = tp->tf_r12;
 	regs->r_r13 = tp->tf_r13;
 	regs->r_r14 = tp->tf_r14;
 	regs->r_r15 = tp->tf_r15;
 
 	/* Instruction pointer, flags and stack. */
 	regs->r_rip = tp->tf_rip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_rflags = tp->tf_rflags;
 	regs->r_rsp = tp->tf_rsp;
 	regs->r_ss = tp->tf_ss;
 
 	/* Data segment selectors, when the frame carries them. */
 	regs->r_ds = has_segs ? tp->tf_ds : 0;
 	regs->r_es = has_segs ? tp->tf_es : 0;
 	regs->r_fs = has_segs ? tp->tf_fs : 0;
 	regs->r_gs = has_segs ? tp->tf_gs : 0;
 	return (0);
 }
 
 /*
  * Load the trapframe of 'td' from 'regs'.
  *
  * Only the low 32 bits of the supplied rflags are used.  The request is
  * rejected with EINVAL, leaving the frame untouched, unless the new
  * flags pass EFL_SECURE() against the current ones and the new %cs
  * passes CS_SECURE().  The ds/es/fs/gs values are currently not loaded
  * (the branch that would do so is disabled).  On success the PCB is
  * flagged PCB_FULL_IRET and 0 is returned.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	register_t newflags = regs->r_rflags & 0xffffffff;
 
 	if (!(EFL_SECURE(newflags, frame->tf_rflags) &&
 	    CS_SECURE(regs->r_cs)))
 		return (EINVAL);
 
 	/* General purpose registers. */
 	frame->tf_rax = regs->r_rax;
 	frame->tf_rbx = regs->r_rbx;
 	frame->tf_rcx = regs->r_rcx;
 	frame->tf_rdx = regs->r_rdx;
 	frame->tf_rsi = regs->r_rsi;
 	frame->tf_rdi = regs->r_rdi;
 	frame->tf_rbp = regs->r_rbp;
 	frame->tf_r8  = regs->r_r8;
 	frame->tf_r9  = regs->r_r9;
 	frame->tf_r10 = regs->r_r10;
 	frame->tf_r11 = regs->r_r11;
 	frame->tf_r12 = regs->r_r12;
 	frame->tf_r13 = regs->r_r13;
 	frame->tf_r14 = regs->r_r14;
 	frame->tf_r15 = regs->r_r15;
 
 	/* Instruction pointer, flags and stack. */
 	frame->tf_rip = regs->r_rip;
 	frame->tf_cs = regs->r_cs;
 	frame->tf_rflags = newflags;
 	frame->tf_rsp = regs->r_rsp;
 	frame->tf_ss = regs->r_ss;
 
 	if (0) {	/* XXXKIB */
 		frame->tf_ds = regs->r_ds;
 		frame->tf_es = regs->r_es;
 		frame->tf_fs = regs->r_fs;
 		frame->tf_gs = regs->r_gs;
 		frame->tf_flags = TF_HASSEGS;
 	}
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /* XXX check all this stuff! */
 /*
  * Externalize: build the exported struct fpreg at fpregs from the
  * save area sv_xmm.  The destination is zeroed first, so anything not
  * explicitly copied below reads as zero.
  */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *dst_env, *src_env;
 	int n;
 
 	/* pcb -> fpregs */
 	bzero(fpregs, sizeof(*fpregs));
 
 	dst_env = (struct envxmm *)&fpregs->fpr_env;
 	src_env = &sv_xmm->sv_env;
 
 	/* FPU control/status */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_rip = src_env->en_rip;
 	dst_env->en_rdp = src_env->en_rdp;
 	dst_env->en_mxcsr = src_env->en_mxcsr;
 	dst_env->en_mxcsr_mask = src_env->en_mxcsr_mask;
 
 	/* FPU registers: 8 entries, 10 bytes copied from each */
 	for (n = 0; n < 8; n++) {
 		bcopy(sv_xmm->sv_fp[n].fp_acc.fp_bytes, fpregs->fpr_acc[n],
 		    10);
 	}
 
 	/* SSE registers: 16 entries, 16 bytes copied from each */
 	for (n = 0; n < 16; n++) {
 		bcopy(sv_xmm->sv_xmm[n].xmm_bytes, fpregs->fpr_xacc[n],
 		    16);
 	}
 }
 
 /*
  * Internalize: load the save area sv_xmm from the exported struct fpreg
  * at fpregs.  The incoming mxcsr mask is restricted to the bits present
  * in cpu_mxcsr_mask.
  */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *dst_env, *src_env;
 	int n;
 
 	/* fpregs -> pcb */
 	dst_env = &sv_xmm->sv_env;
 	src_env = (struct envxmm *)&fpregs->fpr_env;
 
 	/* FPU control/status */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_rip = src_env->en_rip;
 	dst_env->en_rdp = src_env->en_rdp;
 	dst_env->en_mxcsr = src_env->en_mxcsr;
 	dst_env->en_mxcsr_mask = cpu_mxcsr_mask & src_env->en_mxcsr_mask;
 
 	/* FPU registers: 8 entries, 10 bytes copied into each */
 	for (n = 0; n < 8; n++) {
 		bcopy(fpregs->fpr_acc[n], sv_xmm->sv_fp[n].fp_acc.fp_bytes,
 		    10);
 	}
 
 	/* SSE registers: 16 entries, 16 bytes copied into each */
 	for (n = 0; n < 16; n++) {
 		bcopy(fpregs->fpr_xacc[n], sv_xmm->sv_xmm[n].xmm_bytes,
 		    16);
 	}
 }
 
 /*
  * Externalize the FPU state of td into fpregs.
  *
  * The thread must be the current thread, be suspended, or belong to a
  * process that should stop (asserted).  fpugetregs() is called before the
  * user save area is read.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct savefpu *user_save;
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 	fpugetregs(td);
 	user_save = get_pcb_user_save_td(td);
 	fill_fpregs_xmm(user_save, fpregs);
 	return (0);
 }
 
 /*
  * Internalize fpregs into the user FPU save area of td, then call
  * fpuuserinited() for the thread.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct savefpu *user_save;
 
 	user_save = get_pcb_user_save_td(td);
 	set_fpregs_xmm(fpregs, user_save);
 	fpuuserinited(td);
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fill *mcp from the trap frame and pcb of td.  When flags contains
  * GET_MC_CLEAR_RET, rax and rdx are reported as zero and the carry flag
  * (PSL_C) is cleared in the reported rflags.  No extended FPU state is
  * exported here: mc_xfpustate and mc_xfpustate_len are set to zero.
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct trapframe *frame;
 	struct pcb *pcb;
 
 	frame = td->td_frame;
 	pcb = td->td_pcb;
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	/* General purpose registers. */
 	mcp->mc_rdi = frame->tf_rdi;
 	mcp->mc_rsi = frame->tf_rsi;
 	mcp->mc_rbp = frame->tf_rbp;
 	mcp->mc_rbx = frame->tf_rbx;
 	mcp->mc_rcx = frame->tf_rcx;
 	mcp->mc_r8  = frame->tf_r8;
 	mcp->mc_r9  = frame->tf_r9;
 	mcp->mc_r10 = frame->tf_r10;
 	mcp->mc_r11 = frame->tf_r11;
 	mcp->mc_r12 = frame->tf_r12;
 	mcp->mc_r13 = frame->tf_r13;
 	mcp->mc_r14 = frame->tf_r14;
 	mcp->mc_r15 = frame->tf_r15;
 
 	/* Return-value registers and flags, optionally scrubbed. */
 	mcp->mc_rax = frame->tf_rax;
 	mcp->mc_rdx = frame->tf_rdx;
 	mcp->mc_rflags = frame->tf_rflags;
 	if ((flags & GET_MC_CLEAR_RET) != 0) {
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 		mcp->mc_rflags &= ~PSL_C;
 	}
 
 	/* Instruction/stack pointers and segment selectors. */
 	mcp->mc_rip = frame->tf_rip;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_rsp = frame->tf_rsp;
 	mcp->mc_ss = frame->tf_ss;
 	mcp->mc_ds = frame->tf_ds;
 	mcp->mc_es = frame->tf_es;
 	mcp->mc_fs = frame->tf_fs;
 	mcp->mc_gs = frame->tf_gs;
 
 	mcp->mc_flags = frame->tf_flags;
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp, NULL, 0);
 
 	/* Refresh the pcb copies of the bases before reporting them. */
 	update_pcb_bases(pcb);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  *
  * Returns EINVAL when mc_len or mc_flags is not acceptable, or when the
  * extended FPU state is larger than this CPU supports; returns the error
  * from copyin() or set_fpcontext() if either fails; returns 0 on success.
  * The trap frame is left untouched on every failure path.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct trapframe *frame;
 	struct pcb *pcb;
 	char *xstate;
 	long new_rflags;
 	int error;
 
 	frame = td->td_frame;
 	pcb = td->td_pcb;
 
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	if ((mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 
 	/* Take only the user-changeable bits from the supplied flags. */
 	new_rflags = (frame->tf_rflags & ~PSL_USERCHANGE) |
 	    (mcp->mc_rflags & PSL_USERCHANGE);
 
 	/* Fetch the optional extended FPU state into a stack buffer. */
 	xstate = NULL;
 	if ((mcp->mc_flags & _MC_HASFPXSTATE) != 0) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		xstate = __builtin_alloca(mcp->mc_xfpustate_len);
 		error = copyin((void *)mcp->mc_xfpustate, xstate,
 		    mcp->mc_xfpustate_len);
 		if (error != 0)
 			return (error);
 	}
 	error = set_fpcontext(td, mcp, xstate, mcp->mc_xfpustate_len);
 	if (error != 0)
 		return (error);
 
 	/* General purpose registers. */
 	frame->tf_rax = mcp->mc_rax;
 	frame->tf_rbx = mcp->mc_rbx;
 	frame->tf_rcx = mcp->mc_rcx;
 	frame->tf_rdx = mcp->mc_rdx;
 	frame->tf_rsi = mcp->mc_rsi;
 	frame->tf_rdi = mcp->mc_rdi;
 	frame->tf_rbp = mcp->mc_rbp;
 	frame->tf_r8  = mcp->mc_r8;
 	frame->tf_r9  = mcp->mc_r9;
 	frame->tf_r10 = mcp->mc_r10;
 	frame->tf_r11 = mcp->mc_r11;
 	frame->tf_r12 = mcp->mc_r12;
 	frame->tf_r13 = mcp->mc_r13;
 	frame->tf_r14 = mcp->mc_r14;
 	frame->tf_r15 = mcp->mc_r15;
 
 	/* Instruction/stack pointers, flags and %ss. */
 	frame->tf_rip = mcp->mc_rip;
 	frame->tf_rflags = new_rflags;
 	frame->tf_rsp = mcp->mc_rsp;
 	frame->tf_ss = mcp->mc_ss;
 
 	/* Data segment selectors are loaded only if the context has them. */
 	frame->tf_flags = mcp->mc_flags;
 	if ((frame->tf_flags & TF_HASSEGS) != 0) {
 		frame->tf_ds = mcp->mc_ds;
 		frame->tf_es = mcp->mc_es;
 		frame->tf_fs = mcp->mc_fs;
 		frame->tf_gs = mcp->mc_gs;
 	}
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	if ((mcp->mc_flags & _MC_HASBASES) != 0) {
 		pcb->pcb_fsbase = mcp->mc_fsbase;
 		pcb->pcb_gsbase = mcp->mc_gsbase;
 	}
 	return (0);
 }
 
 /*
  * Export the FPU state of td into the machine context mcp.
  *
  * The legacy save area is always copied into mc_fpstate, and mc_ownedfp
  * and mc_fpformat are filled in.  When xsave is in use and the caller
  * supplies a buffer (xfpusave, xfpusave_len bytes), the extended state
  * that follows the legacy area is copied into it, _MC_HASFPXSTATE is set
  * in mc_flags and mc_xfpustate_len records how many bytes are valid.
  * If the buffer is larger than the extended state, the unused tail is
  * zeroed so that it never carries stale data.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		/*
 		 * Clear the tail before clamping len; computing the size
 		 * after the clamp would always yield zero bytes.
 		 */
 		bzero(xfpusave + max_len, len - max_len);
 		len = max_len;
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Install the FPU state described by mcp (plus optional extended state
  * in xfpustate) into td.
  *
  * A context with format _MC_FPFMT_NODEV is accepted and ignored.  Any
  * format other than _MC_FPFMT_XMM, or an unknown mc_ownedfp value, yields
  * EINVAL.  _MC_FPOWNED_NONE drops the thread's FPU state; the FPU/PCB
  * owned cases hand the state to fpusetregs() and return its result.
  */
 static int
 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	return (fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 	    xfpustate, xfpustate_len));
 }
 
 /*
  * Discard the user FPU state of td.
  *
  * Asserts that the pcb of td satisfies PCB_USER_FPU().  Inside a critical
  * section, fpudrop() is called if td is the per-CPU fpcurthread, and then
  * PCB_FPUINITDONE and PCB_USERFPUINITDONE are cleared.
  *
  * NOTE(review): the flags are cleared on curthread->td_pcb, not on
  * td->td_pcb.  The two are the same only when td == curthread; confirm
  * that no caller passes a different thread.
  */
 void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	/*
 	 * XXX force a full drop of the fpu.  The above only drops it if we
 	 * owned it.
 	 *
 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 	 * have too many layers.
 	 */
 	clear_pcb_flags(curthread->td_pcb,
 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	critical_exit();
 }
 
 /*
  * Report debug registers in dbregs.
  *
  * With td == NULL the values are read from the hardware registers via
  * rdr0()..rdr7(); otherwise they come from the copies kept in the pcb of
  * td.  Only dr0-dr3, dr6 and dr7 carry values; dr4, dr5 and dr8-dr15 are
  * always reported as zero.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td != NULL) {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 	} else {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	}
 
 	/* Slots with no value of their own. */
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	for (i = 8; i <= 15; i++)
 		dbregs->dr[i] = 0;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		set_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	return (0);
 }
 
 /*
  * Load zero into the debug registers dr7, dr0-dr3 and dr6.
  * dr7 is written first so that the control bits are off before the
  * address and status registers are cleared.
  */
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 /*
  * pcb_flags is only modified by the current thread, or by other threads
  * while the current thread is stopped.  However, the current thread may
  * also change it from interrupt context in cpu_switch(), or in the trap
  * handler.  When pcb_flags is read-modify-written from C sources, the
  * compiler may generate code that is not atomic with respect to the
  * interrupt handler.  If a trap or interrupt happens and any flag is
  * modified from the handler, it can be clobbered with the cached value
  * later.  Therefore, setting and clearing flags is implemented with
  * single-instruction functions, which do not race with possible
  * modification of the flags from the trap or interrupt context, because
  * traps and interrupts are executed only on an instruction boundary.
  *
  * set_pcb_flags_raw() ORs flags into pcb->pcb_flags with one orl
  * instruction and does nothing else.
  */
 void
 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("orl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 
 }
 
 /*
  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
  * base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
  * the pcb if user space modified the bases.  We must save on the context
  * switch or if the return to usermode happens through the doreti.
  *
  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
  * which has the consequence that the base MSRs must be saved each time
  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
  * context switches.
  *
  * The bases are saved only when all of the following hold: pcb is the
  * current pcb, the caller is setting PCB_FULL_IRET, the flag is not
  * already set, and the CPU reports CPUID_STDEXT_FSGSBASE.  In every other
  * case the flags are simply ORed in.
  */
 void
 set_pcb_flags(struct pcb *pcb, const u_int flags)
 {
 	register_t r;
 
 	if (curpcb == pcb &&
 	    (flags & PCB_FULL_IRET) != 0 &&
 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
 	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
 		r = intr_disable();
 		/* Re-check the flag now that interrupts are disabled. */
 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 			/* A base is saved only if its selector matches. */
 			if (rfs() == _ufssel)
 				pcb->pcb_fsbase = rdfsbase();
 			if (rgs() == _ugssel)
 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 		}
 		set_pcb_flags_raw(pcb, flags);
 		intr_restore(r);
 	} else {
 		set_pcb_flags_raw(pcb, flags);
 	}
 }
 
 /*
  * Clear the bits given in flags from pcb->pcb_flags.  The update is a
  * single andl instruction with the complement of flags as its operand.
  */
 void
 clear_pcb_flags(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("andl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* Prototypes for the functions defined below; silence compiler warnings. */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Function form of inb(): returns what inb() returns for port. */
 u_char
 inb_(u_short port)
 {
 	u_char value;
 
 	value = inb(port);
 	return (value);
 }
 
 /* Function form of outb(): passes port and data straight to outb(). */
 void
 outb_(u_short port, u_char data)
 {
 
 	outb(port, data);
 }
 
 #endif /* KDB */
Index: projects/bsd_rdma_4_9/sys/amd64/ia32/ia32_signal.c
===================================================================
--- projects/bsd_rdma_4_9/sys/amd64/ia32/ia32_signal.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/amd64/ia32/ia32_signal.c	(revision 326162)
@@ -1,971 +1,970 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2003 Peter Wemm
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/resourcevar.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/ia32/ia32_signal.h>
 #include <machine/psl.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
 #include <machine/frame.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/cpufunc.h>
 
 #ifdef COMPAT_FREEBSD4
 /* Forward declaration; the definition appears later in this file. */
 static void freebsd4_ia32_sendsig(sig_t, ksiginfo_t *, sigset_t *);
 #endif
 
 /* True when the privilege level of selector cs, ISPL(cs), is SEL_UPL. */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 /* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 /*
  * Export the FPU state of td into the 32-bit machine context mcp.
  *
  * The legacy save area is always copied into mc_fpstate, and mc_ownedfp
  * and mc_fpformat are filled in.  When xsave is in use and the caller
  * supplies a buffer (xfpusave, xfpusave_len bytes), the extended state
  * that follows the legacy area is copied into it, _MC_IA32_HASFPXSTATE is
  * set in mc_flags and mc_xfpustate_len records how many bytes are valid.
  * If the buffer is larger than the extended state, the unused tail is
  * zeroed so that it never carries stale data.
  */
 static void
 ia32_get_fpcontext(struct thread *td, struct ia32_mcontext *mcp,
     char *xfpusave, size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	/*
 	 * XXX Format of 64bit and 32bit FXSAVE areas differs. FXSAVE
 	 * in 32bit mode saves %cs and %ds, while on 64bit it saves
 	 * 64bit instruction and data pointers. Ignore the difference
 	 * for now, it should be irrelevant for most applications.
 	 */
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		/*
 		 * Clear the tail before clamping len; computing the size
 		 * after the clamp would always yield zero bytes.
 		 */
 		bzero(xfpusave + max_len, len - max_len);
 		len = max_len;
 	}
 	mcp->mc_flags |= _MC_IA32_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Install the FPU state described by the 32-bit context mcp (plus
  * optional extended state in xfpustate) into td.
  *
  * A context with format _MC_FPFMT_NODEV is accepted and ignored.  Any
  * format other than _MC_FPFMT_XMM, or an unknown mc_ownedfp value, yields
  * EINVAL.  _MC_FPOWNED_NONE drops the thread's FPU state; the FPU/PCB
  * owned cases hand the state to fpusetregs() and return its result.
  */
 static int
 ia32_set_fpcontext(struct thread *td, struct ia32_mcontext *mcp,
     char *xfpustate, size_t xfpustate_len)
 {
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	return (fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 	    xfpustate, xfpustate_len));
 }
 
 /*
  * Get machine context.
  */
 static int
 ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 	/* Entry into kernel always sets TF_HASSEGS */
 	mcp->mc_gs = tp->tf_gs;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_edi = tp->tf_rdi;
 	mcp->mc_esi = tp->tf_rsi;
 	mcp->mc_ebp = tp->tf_rbp;
 	mcp->mc_isp = tp->tf_rsp;
 	mcp->mc_eflags = tp->tf_rflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	} else {
 		mcp->mc_eax = tp->tf_rax;
 		mcp->mc_edx = tp->tf_rdx;
 	}
 	mcp->mc_ebx = tp->tf_rbx;
 	mcp->mc_ecx = tp->tf_rcx;
 	mcp->mc_eip = tp->tf_rip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_esp = tp->tf_rsp;
 	mcp->mc_ss = tp->tf_ss;
 	mcp->mc_len = sizeof(*mcp);
 	mcp->mc_flags = tp->tf_flags;
 	ia32_get_fpcontext(td, mcp, NULL, 0);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 static int
 ia32_set_mcontext(struct thread *td, struct ia32_mcontext *mcp)
 {
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	rflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_IA32_HASFPXSTATE) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 		ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	ret = ia32_set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_gs = mcp->mc_gs;
 	tp->tf_fs = mcp->mc_fs;
 	tp->tf_es = mcp->mc_es;
 	tp->tf_ds = mcp->mc_ds;
 	tp->tf_flags = TF_HASSEGS;
 	tp->tf_rdi = mcp->mc_edi;
 	tp->tf_rsi = mcp->mc_esi;
 	tp->tf_rbp = mcp->mc_ebp;
 	tp->tf_rbx = mcp->mc_ebx;
 	tp->tf_rdx = mcp->mc_edx;
 	tp->tf_rcx = mcp->mc_ecx;
 	tp->tf_rax = mcp->mc_eax;
 	/* trapno, err */
 	tp->tf_rip = mcp->mc_eip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_esp;
 	tp->tf_ss = mcp->mc_ss;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /*
  * The first two fields of a ucontext_t are the signal mask and
  * the machine context.  The next field is uc_link; we want to
  * avoid destroying the link when copying out contexts, so only the
  * bytes that precede uc_link are transferred.
  */
 #define	UC_COPY_SIZE	offsetof(struct ia32_ucontext, uc_link)
 
 /*
  * getcontext() for 32-bit processes: copy the current signal mask and
  * machine context of td out to uap->ucp.
  *
  * Returns EINVAL when uap->ucp is NULL, otherwise the result of copyout().
  */
 int
 freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
 {
 	struct ia32_ucontext uc;
 	int ret;
 
 	if (uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		/*
 		 * ia32_get_mcontext() does not write every member of the
 		 * machine context, so start from zero to avoid copying
 		 * stale kernel stack bytes out to the process.
 		 */
 		bzero(&uc, sizeof(uc));
 		ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
 	}
 	return (ret);
 }
 
 int
 freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
 {
 	struct ia32_ucontext uc;
 	int ret;	
 
 	if (uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
 		if (ret == 0) {
 			ret = ia32_set_mcontext(td, &uc.uc_mcontext);
 			if (ret == 0) {
 				kern_sigprocmask(td, SIG_SETMASK,
 				    &uc.uc_sigmask, NULL, 0);
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
 
 /*
  * swapcontext() for 32-bit processes: save the current context of td to
  * uap->oucp, then install the context read from uap->ucp.
  *
  * Returns EINVAL when either pointer is NULL, the first error from
  * copyout(), copyin() or ia32_set_mcontext(), and EJUSTRETURN on success.
  */
 int
 freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
 {
 	struct ia32_ucontext uc;
 	int ret;
 
 	if (uap->oucp == NULL || uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		/*
 		 * ia32_get_mcontext() does not write every member of the
 		 * machine context, so start from zero to avoid copying
 		 * stale kernel stack bytes out to the process.
 		 */
 		bzero(&uc, sizeof(uc));
 		ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
 		if (ret == 0) {
 			ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
 			if (ret == 0) {
 				ret = ia32_set_mcontext(td, &uc.uc_mcontext);
 				if (ret == 0) {
 					kern_sigprocmask(td, SIG_SETMASK,
 					    &uc.uc_sigmask, NULL, 0);
 				}
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 
 #ifdef COMPAT_43
 /*
  * Deliver a signal to the current thread using the old (ia32_sigframe3)
  * frame layout.
  *
  * Must be entered with the process lock and the sigacts mutex held (both
  * are asserted).  Both are dropped while the frame is built and copied
  * out, and re-acquired before returning.  If the frame cannot be copied
  * to the user stack, the process is terminated with SIGILL.
  */
 static void
 ia32_osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct ia32_sigframe3 sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct ia32_sigframe3 *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(sf));
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	} else
 		fp = (struct ia32_sigframe3 *)regs->tf_rsp - 1;
 
 	/*
 	 * Not every member of the frame is written on every path below
 	 * (sf_addr is skipped for SA_SIGINFO handlers, si_signo/si_code
 	 * otherwise), yet the whole frame is copied out.  Zero it first so
 	 * that no stale kernel stack bytes reach the process.
 	 */
 	bzero(&sf, sizeof(sf));
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ah = (uintptr_t)catcher;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ah = (uintptr_t)catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_rax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_rbx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_rcx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_rdx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_rsi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_rdi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	sf.sf_siginfo.si_sc.sc_gs = regs->tf_gs;
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_rsp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_esp = regs->tf_rsp;
 	sf.sf_siginfo.si_sc.sc_ebp = regs->tf_rbp;
 	sf.sf_siginfo.si_sc.sc_eip = regs->tf_rip;
 	sf.sf_siginfo.si_sc.sc_eflags = regs->tf_rflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the old-style signal trampoline. */
 	regs->tf_rsp = (uintptr_t)fp;
 	regs->tf_rip = p->p_sysent->sv_psstrings - sz_ia32_osigcode;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Deliver a signal to the current thread using the FreeBSD 4
  * (ia32_sigframe4) frame layout.
  *
  * Must be entered with the process lock and the sigacts mutex held (both
  * are asserted).  The process lock is dropped once the frame address is
  * chosen and the sigacts mutex once the handler arguments are built; both
  * are re-acquired before returning.  If the frame cannot be copied to the
  * user stack, the process is terminated with SIGILL.
  */
 static void
 freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct ia32_sigframe4 sf, *sfp;
 	struct siginfo32 siginfo;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int oonstack;
 	int sig;
 
 	td = curthread;
 	p = td->td_proc;
 	/* Work from the 32-bit form of the signal information. */
 	siginfo_to_siginfo32(&ksi->ksi_info, &siginfo);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = siginfo.si_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/* Save user context. */
 	/* The frame is zeroed first, so members not set below stay zero. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
 	sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi;
 	sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi;
 	sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp;
 	sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */
 	sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx;
 	sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx;
 	sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx;
 	sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax;
 	sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno;
 	sf.sf_uc.uc_mcontext.mc_err = regs->tf_err;
 	sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip;
 	sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs;
 	sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags;
 	sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp;
 	sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss;
 	sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds;
 	sf.sf_uc.uc_mcontext.mc_es = regs->tf_es;
 	sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs;
 	sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs;
 	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
 	bzero(sf.sf_uc.uc_mcontext.__spare__,
 	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	/* Top of the alternate stack if requested, else below current %rsp. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct ia32_sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(sf));
 	} else
 		sfp = (struct ia32_sigframe4 *)regs->tf_rsp - 1;
 	PROC_UNLOCK(p);
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = siginfo;
 		sf.sf_si.si_signo = sig;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = siginfo.si_code;
 		sf.sf_addr = (u_int32_t)siginfo.si_addr;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the FreeBSD 4 signal trampoline. */
 	regs->tf_rsp = (uintptr_t)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base + sz_ia32_sigcode -
 	    sz_freebsd4_ia32_sigcode;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	/* leave user %fs and %gs untouched */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Build a struct ia32_sigframe for delivery of a signal to the current
  * thread and redirect its trapframe to p_sysent->sv_sigcode_base.
  *
  * catcher: handler address, stored (truncated to 32 bits) in sf_ah.
  * ksi:     signal information; converted to a struct siginfo32.
  * mask:    signal mask recorded in the saved context (uc_sigmask).
  *
  * Locking: asserts that the proc lock and psp->ps_mtx are owned.  The proc
  * lock is released once the frame address is known and ps_mtx once the
  * handler arguments are filled in; both are taken again before returning.
  *
  * If either copyout() fails, sigexit(td, SIGILL) is called.
  */
 void
 ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct ia32_sigframe sf, *sfp;
 	struct siginfo32 siginfo;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int oonstack;
 	int sig;
 
 	siginfo_to_siginfo32(&ksi->ksi_info, &siginfo);
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = siginfo.si_signo;
 	psp = p->p_sigacts;
 	/*
 	 * Signals flagged in ps_freebsd4 / ps_osigset are handed to the
 	 * corresponding legacy sendsig routine instead of being handled here.
 	 */
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_ia32_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		ia32_osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/*
 	 * When use_xsave is set and the CPU's extended state is larger than
 	 * struct savefpu, stage the excess in a temporary buffer on the
 	 * kernel stack; it is copied out separately below.
 	 */
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
 	sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi;
 	sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi;
 	sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp;
 	sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */
 	sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx;
 	sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx;
 	sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx;
 	sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax;
 	sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno;
 	sf.sf_uc.uc_mcontext.mc_err = regs->tf_err;
 	sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip;
 	sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs;
 	sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags;
 	sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp;
 	sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss;
 	sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds;
 	sf.sf_uc.uc_mcontext.mc_es = regs->tf_es;
 	sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs;
 	sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs;
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase;
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig))
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 	else
 		sp = (char *)regs->tf_rsp;
 	if (xfpusave != NULL) {
 		/*
 		 * Reserve room for the extended state above the frame and
 		 * round the address down to a 64-byte boundary; the user
 		 * address is recorded in the context for sigreturn.
 		 */
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(sf);
 	/* Align to 16 bytes. */
 	sfp = (struct ia32_sigframe *)((uintptr_t)sp & ~0xF);
 	PROC_UNLOCK(p);
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = siginfo;
 		sf.sf_si.si_signo = sig;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = siginfo.si_code;
 		sf.sf_addr = (u_int32_t)siginfo.si_addr;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 
 	/*
 	 * Copy the sigframe out to the user's stack, followed by the
 	 * extended FPU state (if any) at the address stored in mc_xfpustate.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    PTRIN(sf.sf_uc.uc_mcontext.mc_xfpustate), xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		/* sigexit() is called with the proc lock held. */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the signal code with the new frame as stack. */
 	regs->tf_rsp = (uintptr_t)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	/* XXXKIB leave user %fs and %gs untouched */
 	/* Re-acquire the locks the caller holds, in the original order. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  */
 
 #ifdef COMPAT_43
 /*
  * Restore thread state from the struct ia32_sigcontext3 at uap->sigcntxp.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if the
  * saved eflags fail EFL_SECURE() or the saved %cs fails CS_SECURE() (the
  * latter also posts SIGBUS/BUS_OBJERR through trapsignal()), and
  * EJUSTRETURN once the trapframe has been rewritten.
  */
 int
 ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap)
 {
 	struct ia32_sigcontext3 sc, *scp;
 	struct trapframe *regs;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	/* Both checks run before any trapframe field is modified. */
 	eflags = scp->sc_eflags;
 	if (!EFL_SECURE(eflags, regs->tf_rflags)) {
 		return (EINVAL);
 	}
 	if (!CS_SECURE(scp->sc_cs)) {
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 	/* Segment registers. */
 	regs->tf_ds = scp->sc_ds;
 	regs->tf_es = scp->sc_es;
 	regs->tf_fs = scp->sc_fs;
 	regs->tf_gs = scp->sc_gs;
 
 	/* General registers, stack, instruction pointer and flags. */
 	regs->tf_rax = scp->sc_eax;
 	regs->tf_rbx = scp->sc_ebx;
 	regs->tf_rcx = scp->sc_ecx;
 	regs->tf_rdx = scp->sc_edx;
 	regs->tf_rsi = scp->sc_esi;
 	regs->tf_rdi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_rbp = scp->sc_ebp;
 	regs->tf_rsp = scp->sc_esp;
 	regs->tf_rip = scp->sc_eip;
 	regs->tf_rflags = eflags;
 
 	/* Bit 0 of sc_onstack selects whether SS_ONSTACK is set or cleared. */
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 
 	kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
 	    SIGPROCMASK_OLD);
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Restore thread state from the struct ia32_ucontext4 at uap->sigcntxp.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if the
  * saved eflags fail EFL_SECURE() or the saved %cs fails CS_SECURE() (a
  * message is sent with uprintf() in both cases, and the %cs failure also
  * posts SIGBUS/BUS_OBJERR through trapsignal()), and EJUSTRETURN once the
  * trapframe and signal mask have been restored.
  *
  * MPSAFE
  */
 int
 freebsd4_freebsd32_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd4_freebsd32_sigreturn_args /* {
 		const struct freebsd4_freebsd32_ucontext *sigcntxp;
 	} */ *uap;
 {
 	struct ia32_ucontext4 uc;
 	struct trapframe *regs;
 	struct ia32_ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy of the user-supplied context. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(eflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, eflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	/* Checks passed: load the saved registers into the trapframe. */
 	regs->tf_rdi = ucp->uc_mcontext.mc_edi;
 	regs->tf_rsi = ucp->uc_mcontext.mc_esi;
 	regs->tf_rbp = ucp->uc_mcontext.mc_ebp;
 	regs->tf_rbx = ucp->uc_mcontext.mc_ebx;
 	regs->tf_rdx = ucp->uc_mcontext.mc_edx;
 	regs->tf_rcx = ucp->uc_mcontext.mc_ecx;
 	regs->tf_rax = ucp->uc_mcontext.mc_eax;
 	regs->tf_trapno = ucp->uc_mcontext.mc_trapno;
 	regs->tf_err = ucp->uc_mcontext.mc_err;
 	regs->tf_rip = ucp->uc_mcontext.mc_eip;
 	regs->tf_cs = cs;
 	regs->tf_rflags = ucp->uc_mcontext.mc_eflags;
 	regs->tf_rsp = ucp->uc_mcontext.mc_esp;
 	regs->tf_ss = ucp->uc_mcontext.mc_ss;
 	regs->tf_ds = ucp->uc_mcontext.mc_ds;
 	regs->tf_es = ucp->uc_mcontext.mc_es;
 	regs->tf_fs = ucp->uc_mcontext.mc_fs;
 	regs->tf_gs = ucp->uc_mcontext.mc_gs;
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Restore thread state from the struct ia32_ucontext at uap->sigcntxp,
  * including FPU state and, when _MC_HASFPXSTATE is set in mc_flags, the
  * extended FPU state referenced by mc_xfpustate.
  *
  * Returns:
  *   - the copyin() error if the context or the extended state cannot be
  *     read;
  *   - EINVAL if the saved eflags fail EFL_SECURE(), the saved %cs fails
  *     CS_SECURE() (which also posts SIGBUS/BUS_OBJERR), or
  *     mc_xfpustate_len exceeds cpu_max_ext_state_size -
  *     sizeof(struct savefpu);
  *   - the non-zero result of ia32_set_fpcontext() if it fails;
  *   - EJUSTRETURN once the trapframe and signal mask have been restored.
  * Each failure after the initial copyin() is also reported with uprintf().
  *
  * MPSAFE
  */
 int
 freebsd32_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd32_sigreturn_args /* {
 		const struct freebsd32_ucontext *sigcntxp;
 	} */ *uap;
 {
 	struct ia32_ucontext uc;
 	struct trapframe *regs;
 	struct ia32_ucontext *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy of the user-supplied context. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(eflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, eflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((ucp->uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		/*
 		 * The length comes from the user-supplied context, so it is
 		 * bounded before being used as the size of the temporary
 		 * kernel-stack buffer.
 		 */
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    td->td_proc->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
 		xfpustate = __builtin_alloca(xfpustate_len);
 		error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate),
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    td->td_proc->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	/* FPU state is restored before the trapframe is touched. */
 	ret = ia32_set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
 	    xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    td->td_proc->p_pid, td->td_name, ret);
 		return (ret);
 	}
 
 	/* All checks passed: load the saved registers into the trapframe. */
 	regs->tf_rdi = ucp->uc_mcontext.mc_edi;
 	regs->tf_rsi = ucp->uc_mcontext.mc_esi;
 	regs->tf_rbp = ucp->uc_mcontext.mc_ebp;
 	regs->tf_rbx = ucp->uc_mcontext.mc_ebx;
 	regs->tf_rdx = ucp->uc_mcontext.mc_edx;
 	regs->tf_rcx = ucp->uc_mcontext.mc_ecx;
 	regs->tf_rax = ucp->uc_mcontext.mc_eax;
 	regs->tf_trapno = ucp->uc_mcontext.mc_trapno;
 	regs->tf_err = ucp->uc_mcontext.mc_err;
 	regs->tf_rip = ucp->uc_mcontext.mc_eip;
 	regs->tf_cs = cs;
 	regs->tf_rflags = ucp->uc_mcontext.mc_eflags;
 	regs->tf_rsp = ucp->uc_mcontext.mc_esp;
 	regs->tf_ss = ucp->uc_mcontext.mc_ss;
 	regs->tf_ds = ucp->uc_mcontext.mc_ds;
 	regs->tf_es = ucp->uc_mcontext.mc_es;
 	regs->tf_fs = ucp->uc_mcontext.mc_fs;
 	regs->tf_gs = ucp->uc_mcontext.mc_gs;
 	regs->tf_flags = TF_HASSEGS;
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 
 /*
  * Clear registers on exec.
  *
  * Resets the thread's trapframe and pcb for a freshly exec'ed 32-bit image:
  * the trapframe is zeroed and then given the image entry point
  * (imgp->entry_addr), the initial stack pointer (stack), imgp->ps_strings
  * in %rbx, and the 32-bit user code/data selectors.  Any per-process LDT is
  * released, the fs/gs bases are cleared and the FPU state is dropped.
  */
 void
 ia32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 #ifdef COMPAT_43
 	setup_lcall_gate();
 #endif
 
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW_I386__;
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	regs->tf_rsp = stack;
 	/*
 	 * NOTE(review): the trapframe was zeroed by the bzero() above, so
 	 * (regs->tf_rflags & PSL_T) is 0 here and tf_rflags becomes
 	 * PSL_USER -- confirm whether preserving PSL_T was intended.
 	 */
 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucode32sel;
 	regs->tf_rbx = imgp->ps_strings;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 
 	fpstate_drop(td);
 
 	/* Return via doreti so that we can change to a different %cs */
 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
-	td->td_retval[1] = 0;
 }
Index: projects/bsd_rdma_4_9/sys/amd64/linux32/linux32_sysvec.c
===================================================================
--- projects/bsd_rdma_4_9/sys/amd64/linux32/linux32_sysvec.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/amd64/linux32/linux32_sysvec.c	(revision 326162)
@@ -1,1220 +1,1219 @@
 /*-
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2003 Peter Wemm
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 1998-1999 Andrew Gallatin
  * Copyright (c) 1994-1996 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_compat.h"
 
 #ifndef COMPAT_FREEBSD32
 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
 #endif
 
 #define	__ELF_WORD_SIZE	32
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <sys/eventhandler.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_futex.h>
 #include <compat/linux/linux_ioctl.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_misc.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_util.h>
 #include <compat/linux/linux_vdso.h>
 
 MODULE_VERSION(linux, 1);
 
 /*
  * Store one auxiliary-vector entry as two consecutive 32-bit words
  * (id, then val) with suword32(), advancing pos after each store.
  * The suword32() return values are not checked.
  */
 #define	AUXARGS_ENTRY_32(pos, id, val)	\
 	do {				\
 		suword32(pos++, id);	\
 		suword32(pos++, val);	\
 	} while (0)
 
 /*
  * The two bytes '#' (0x23) and '!' (0x21) read as a 16-bit value in host
  * byte order.
  */
 #if BYTE_ORDER == LITTLE_ENDIAN
 #define SHELLMAGIC      0x2123 /* #! */
 #else
 #define SHELLMAGIC      0x2321
 #endif
 
 /*
  * Allow the sendsig functions to use the ldebug() facility
  * even though they are not syscalls themselves. Map them
  * to syscall 0. This is slightly less bogus than using
  * ldebug(sigreturn).
  */
 #define	LINUX32_SYS_linux_rt_sendsig	0
 #define	LINUX32_SYS_linux_sendsig	0
 
 /* Not static: visible to other translation units. */
 const char *linux_kplatform;
 /* File-local state; presumably describes the shared (vdso) page -- TODO confirm. */
 static int linux_szsigcode;
 static vm_object_t linux_shared_page_obj;
 static char *linux_shared_page_mapping;
 /*
  * Symbols defined outside this file; by their names they presumably bracket
  * an embedded linux32_locore.o image -- TODO confirm against the build glue.
  */
 extern char _binary_linux32_locore_o_start;
 extern char _binary_linux32_locore_o_end;
 
 /* System call table for this ABI, defined elsewhere. */
 extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL];
 
 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
 
 static int	elf_linux_fixup(register_t **stack_base,
 		    struct image_params *iparams);
 static register_t *linux_copyout_strings(struct image_params *imgp);
 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
 static void	exec_linux_setregs(struct thread *td, 
 				   struct image_params *imgp, u_long stack);
 static void	linux32_fixlimit(struct rlimit *rl, int which);
 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
 static void	linux_vdso_install(void *param);
 static void	linux_vdso_deinstall(void *param);
 
 /*
  * Linux syscalls return negative errno's, we do positive and map them
  * Reference:
  *   FreeBSD: src/sys/sys/errno.h
  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
  *            linux-2.6.17.8/include/asm-generic/errno.h
  *
  * Each row below holds ten entries; by the table's name and size the index
  * is presumably the FreeBSD errno value and the entry the negated Linux
  * errno.  The initializer lists 93 values, so if ELAST + 1 is larger the
  * remaining slots are implicitly zero -- TODO confirm against errno.h.
  */
 static int bsd_to_linux_errno[ELAST + 1] = {
 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
 	 -72, -67, -71
 };
 
 /* Value reported for trap codes that have no entry in the table below. */
 #define LINUX_T_UNKNOWN  255
 /*
  * Trap number translation table.  The index is the FreeBSD trap code (the
  * T_* name in each entry's comment) and the value is the number placed in
  * the Linux sigcontext sc_trapno field by the sendsig routines in this file.
  */
 static int _bsd_to_linux_trapcode[] = {
 	LINUX_T_UNKNOWN,	/* 0 */
 	6,			/* 1  T_PRIVINFLT */
 	LINUX_T_UNKNOWN,	/* 2 */
 	3,			/* 3  T_BPTFLT */
 	LINUX_T_UNKNOWN,	/* 4 */
 	LINUX_T_UNKNOWN,	/* 5 */
 	16,			/* 6  T_ARITHTRAP */
 	254,			/* 7  T_ASTFLT */
 	LINUX_T_UNKNOWN,	/* 8 */
 	13,			/* 9  T_PROTFLT */
 	1,			/* 10 T_TRCTRAP */
 	LINUX_T_UNKNOWN,	/* 11 */
 	14,			/* 12 T_PAGEFLT */
 	LINUX_T_UNKNOWN,	/* 13 */
 	17,			/* 14 T_ALIGNFLT */
 	LINUX_T_UNKNOWN,	/* 15 */
 	LINUX_T_UNKNOWN,	/* 16 */
 	LINUX_T_UNKNOWN,	/* 17 */
 	0,			/* 18 T_DIVIDE */
 	2,			/* 19 T_NMI */
 	4,			/* 20 T_OFLOW */
 	5,			/* 21 T_BOUND */
 	7,			/* 22 T_DNA */
 	8,			/* 23 T_DOUBLEFLT */
 	9,			/* 24 T_FPOPFLT */
 	10,			/* 25 T_TSSFLT */
 	11,			/* 26 T_SEGNPFLT */
 	12,			/* 27 T_STKFLT */
 	18,			/* 28 T_MCHK */
 	19,			/* 29 T_XMMFLT */
 	15			/* 30 T_RESERVED */
 };
 /*
  * Bounds-checked lookup: codes at or beyond the end of the table yield
  * LINUX_T_UNKNOWN.  The argument is evaluated twice, so it must be free of
  * side effects.  A negative code is presumably also rejected because
  * nitems() yields an unsigned count -- TODO confirm.
  */
 #define bsd_to_linux_trapcode(code) \
     ((code)<nitems(_bsd_to_linux_trapcode)? \
      _bsd_to_linux_trapcode[(code)]: \
      LINUX_T_UNKNOWN)
 
 /*
  * Argument/environment string bookkeeping with fixed 32-bit fields.  The
  * two *str members are held as u_int32_t rather than pointers; presumably
  * they carry 32-bit user addresses -- TODO confirm.
  */
 struct linux32_ps_strings {
 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
 	u_int ps_nargvstr;	/* the number of argument strings */
 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
 	u_int ps_nenvstr;	/* the number of environment strings */
 };
 
 /*
  * Symbols declared through the LINUX_VDSO_SYM_* macros (defined outside
  * this file).  Within this file linux32_sigcode and linux32_rt_sigcode are
  * loaded into tf_rip by the sendsig routines, linux32_vsyscall is exported
  * as the LINUX_AT_SYSINFO aux entry and linux_platform as
  * LINUX_AT_PLATFORM.  Presumably the values are resolved when the vdso is
  * installed -- TODO confirm.
  */
 LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
 LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
 LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
 LINUX_VDSO_SYM_CHAR(linux_platform);
 
 /*
  * Reconcile FreeBSD's and Linux's view of which signal a trap produces.
  *
  * A SIGBUS raised by a protection, TSS, double or page fault is reported
  * as SIGSEGV; every other signal/trap combination is returned unchanged.
  *
  * MPSAFE
  */
 static int
 translate_traps(int signal, int trap_code)
 {
 	int remap_to_segv;
 
 	/* Only SIGBUS is ever rewritten. */
 	if (signal != SIGBUS)
 		return (signal);
 
 	remap_to_segv = (trap_code == T_PROTFLT || trap_code == T_TSSFLT ||
 	    trap_code == T_DOUBLEFLT || trap_code == T_PAGEFLT);
 
 	return (remap_to_segv ? SIGSEGV : signal);
 }
 
 /*
  * Write the ELF auxiliary vector for a 32-bit Linux image to the user
  * stack and store argc just below the initial stack base.
  *
  * stack_base: in/out.  The auxiliary entries are written starting
  *             argc + envc + 2 words above *stack_base (presumably just
  *             past the NULL-terminated argv and envp pointer arrays --
  *             TODO confirm).  On return *stack_base is moved down by one
  *             32-bit word, which holds argc.
  * imgp:       image parameters; imgp->auxargs is read, then freed and set
  *             to NULL.
  *
  * Always returns 0; the results of the user-space stores are not checked.
  */
 static int
 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
 {
 	Elf32_Auxargs *args;
 	Elf32_Addr *base;
 	Elf32_Addr *pos;
 	int issetugid;
 
 	KASSERT(curthread->td_proc == imgp->proc,
 	    ("unsafe elf_linux_fixup(), should be curproc"));
 	base = (Elf32_Addr *)*stack_base;
 	args = (Elf32_Auxargs *)imgp->auxargs;
 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
 
 	issetugid = (imgp->proc->p_flag & P_SUGID) ? 1 : 0;
 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
 	    imgp->proc->p_sysent->sv_shared_page_base);
 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
 
 	/*
 	 * Do not export AT_CLKTCK when emulating a Linux kernel older than
 	 * 2.4.0, as the entry first appeared in 2.4.0-rc7.
 	 * When exported, AT_CLKTCK is what sysconf(_SC_CLK_TCK) returns;
 	 * glibc falls back to the hard-coded CLK_TCK value when the aux
 	 * entry is not present.
 	 * Also see linux_times() implementation.
 	 */
 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, issetugid);
 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
 	/*
 	 * pos walks an array of 32-bit words, so these entries use the
 	 * 32-bit store macro like every other entry in the vector.
 	 */
 	AUXARGS_ENTRY_32(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary));
 	if (imgp->execpathp != 0)
 		AUXARGS_ENTRY_32(pos, LINUX_AT_EXECFN,
 		    PTROUT(imgp->execpathp));
 	if (args->execfd != -1)
 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
 
 	/* The aux arguments have been consumed; release them. */
 	free(imgp->auxargs, M_TEMP);
 	imgp->auxargs = NULL;
 
 	/* Store argc as a 32-bit word immediately below the old base. */
 	base--;
 	suword32(base, (uint32_t)imgp->args->argc);
 	*stack_base = (register_t *)base;
 	return (0);
 }
 
 /*
  * Deliver a signal using a struct l_rt_sigframe: the frame (handler
  * arguments, Linux siginfo and ucontext) is built in the kernel, copied to
  * the user stack, and the trapframe is pointed at linux32_rt_sigcode.
  * Called from linux_sendsig() for handlers installed with SA_SIGINFO.
  *
  * catcher: handler address, stored in sf_handler.
  * ksi:     signal information (number, code, fault address).
  * mask:    signal mask saved into the frame's uc_sigmask.
  *
  * Locking: asserts that the proc lock and psp->ps_mtx are owned.  ps_mtx
  * is released once the frame address has been chosen and the proc lock
  * after the uc_stack fields are filled in; both are taken again before
  * returning.  If the copyout() fails, sigexit(td, SIGILL) is called.
  */
 static void
 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	struct l_rt_sigframe *fp, frame;
 	int oonstack;
 	int sig;
 	int code;
 	
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 #ifdef DEBUG
 	if (ldebug(rt_sendsig))
 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
 		    catcher, sig, (void*)mask, code);
 #endif
 	/*
 	 * Allocate space for the signal handler context: at the top of the
 	 * alternate signal stack when one is enabled, not already in use and
 	 * requested for this signal, otherwise just below the current %rsp.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
 	} else
 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
 	mtx_unlock(&psp->ps_mtx);
 
 	/*
 	 * Build the argument list for the signal handler.  From here on sig
 	 * holds the Linux signal number; the ps_sigonstack lookup above used
 	 * the untranslated one.
 	 */
 	sig = bsd_to_linux_signal(sig);
 
 	bzero(&frame, sizeof(frame));
 
 	frame.sf_handler = PTROUT(catcher);
 	frame.sf_sig = sig;
 	/* User-space addresses of the siginfo and ucontext inside the frame. */
 	frame.sf_siginfo = PTROUT(&fp->sf_si);
 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
 
 	/* Fill in POSIX parts */
 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
 
 	/*
 	 * Build the signal context to be used by sigreturn
 	 * and libgcc unwind.
 	 */
 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
 
 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
 	PROC_UNLOCK(p);
 
 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
 
 	/* Snapshot of the interrupted register state. */
 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__mask;
 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
 	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
 	/* code is ksi->ksi_code; it is used as the trap table index here. */
 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
 
 #ifdef DEBUG
 	if (ldebug(rt_sendsig))
 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
 #endif
 
 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
 		/*
 		 * Process has trashed its stack; give it an illegal
 		 * instruction to halt it in its tracks.
 		 */
 #ifdef DEBUG
 		if (ldebug(rt_sendsig))
 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
 			    fp, oonstack);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Build context to run handler in.
 	 */
 	regs->tf_rsp = PTROUT(fp);
 	regs->tf_rip = linux32_rt_sigcode;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	/* Re-acquire the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 
 /*
  * Send a signal to the current thread's process.
  *
  * A struct l_sigframe holding the handler address, the Linux signal
  * number and the interrupted register context is copied to the user
  * stack, and the trapframe is redirected to linux32_sigcode with the
  * frame as the new stack pointer.  The saved context is what
  * linux_sigreturn() later reads back to restore the signal mask, the
  * stack and the registers.
  *
  * Handlers installed with SA_SIGINFO are delegated to linux_rt_sendsig().
  *
  * Locking: asserts that the proc lock and psp->ps_mtx are owned; both are
  * released once the frame address has been chosen and taken again before
  * returning.  If the copyout() fails, sigexit(td, SIGILL) is called.
  */
 static void
 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	struct l_sigframe *fp, frame;
 	l_sigset_t lmask;
 	int oonstack;
 	int sig, code;
 
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		linux_rt_sendsig(catcher, ksi, mask);
 		return;
 	}
 
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 #ifdef DEBUG
 	if (ldebug(sendsig))
 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
 		    catcher, sig, (void*)mask, code);
 #endif
 
 	/*
 	 * Allocate space for the signal handler context: at the top of the
 	 * alternate signal stack when one is enabled, not already in use and
 	 * requested for this signal, otherwise just below the current %rsp.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
 	} else
 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Build the argument list for the signal handler.  From here on sig
 	 * holds the Linux signal number.
 	 */
 	sig = bsd_to_linux_signal(sig);
 
 	bzero(&frame, sizeof(frame));
 
 	frame.sf_handler = PTROUT(catcher);
 	frame.sf_sig = sig;
 
 	bsd_to_linux_sigset(mask, &lmask);
 
 	/*
 	 * Build the signal context to be used by sigreturn.
 	 */
 	frame.sf_sc.sc_mask   = lmask.__mask;
 	frame.sf_sc.sc_gs     = regs->tf_gs;
 	frame.sf_sc.sc_fs     = regs->tf_fs;
 	frame.sf_sc.sc_es     = regs->tf_es;
 	frame.sf_sc.sc_ds     = regs->tf_ds;
 	frame.sf_sc.sc_edi    = regs->tf_rdi;
 	frame.sf_sc.sc_esi    = regs->tf_rsi;
 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
 	frame.sf_sc.sc_esp    = regs->tf_rsp;
 	frame.sf_sc.sc_edx    = regs->tf_rdx;
 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
 	frame.sf_sc.sc_eax    = regs->tf_rax;
 	frame.sf_sc.sc_eip    = regs->tf_rip;
 	frame.sf_sc.sc_cs     = regs->tf_cs;
 	frame.sf_sc.sc_eflags = regs->tf_rflags;
 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
 	frame.sf_sc.sc_ss     = regs->tf_ss;
 	frame.sf_sc.sc_err    = regs->tf_err;
 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
 	/* code is ksi->ksi_code; it is used as the trap table index here. */
 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
 
 	/*
 	 * NOTE(review): the same lmask.__mask value is stored in both
 	 * sc_mask (above) and sf_extramask[0] -- confirm whether the extra
 	 * word was meant to carry a different part of the mask.
 	 */
 	frame.sf_extramask[0] = lmask.__mask;
 
 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
 		/*
 		 * Process has trashed its stack; give it an illegal
 		 * instruction to halt it in its tracks.
 		 */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Build context to run handler in.
 	 */
 	regs->tf_rsp = PTROUT(fp);
 	regs->tf_rip = linux32_sigcode;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	/* Re-acquire the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to clean up state after a signal has been taken.
  *
  * Reads the struct l_sigframe at args->sfp, restores the signal mask and
  * then the register context saved by linux_sendsig().  The saved eflags
  * and %cs are validated first so that the user cannot gain improper
  * privileges or cause a machine fault by editing the frame.
  *
  * Returns EFAULT if the frame cannot be copied in, EINVAL if the saved
  * eflags differ from the current ones outside PSL_USERCHANGE or the saved
  * %cs is not at user privilege level (the latter also posts
  * SIGBUS/BUS_OBJERR through trapsignal()), and EJUSTRETURN on success.
  */
 int
 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
 {
 	struct l_sigframe frame;
 	struct trapframe *regs;
 	sigset_t bmask;
 	l_sigset_t lmask;
 	int eflags;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 
 #ifdef DEBUG
 	if (ldebug(sigreturn))
 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
 #endif
 	/*
 	 * The trampoline code hands us the sigframe.
 	 * It is unsafe to keep track of it ourselves, in the event that a
 	 * program jumps out of a signal handler.
 	 */
 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
 		return (EFAULT);
 
 	/*
 	 * Check for security violations.
 	 */
 	/* True when ef and oef agree on every bit outside PSL_USERCHANGE. */
 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 	eflags = frame.sf_sc.sc_eflags;
 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
 		return(EINVAL);
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return(EINVAL);
 	}
 
 	/*
 	 * NOTE(review): the first store to lmask.__mask is dead -- it is
 	 * overwritten by the next line, so only sf_extramask[0] determines
 	 * the restored mask.  Confirm whether the two words were meant to be
 	 * combined (linux_sendsig() stores the same value in both).
 	 */
 	lmask.__mask = frame.sf_sc.sc_mask;
 	lmask.__mask = frame.sf_extramask[0];
 	linux_to_bsd_sigset(&lmask, &bmask);
 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
 
 	/*
 	 * Restore signal context.
 	 */
 	regs->tf_rdi    = frame.sf_sc.sc_edi;
 	regs->tf_rsi    = frame.sf_sc.sc_esi;
 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
 	regs->tf_rdx    = frame.sf_sc.sc_edx;
 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
 	regs->tf_rax    = frame.sf_sc.sc_eax;
 	regs->tf_rip    = frame.sf_sc.sc_eip;
 	regs->tf_cs     = frame.sf_sc.sc_cs;
 	regs->tf_ds     = frame.sf_sc.sc_ds;
 	regs->tf_es     = frame.sf_sc.sc_es;
 	regs->tf_fs     = frame.sf_sc.sc_fs;
 	regs->tf_gs     = frame.sf_sc.sc_gs;
 	regs->tf_rflags = eflags;
 	/* The stack pointer comes from sc_esp_at_signal, not sc_esp. */
 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
 	regs->tf_ss     = frame.sf_sc.sc_ss;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 
 	return (EJUSTRETURN);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by rt_sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * psl to gain improper privileges or to cause
  * a machine fault.
  *
  * The l_ucontext is copied in from the user address args->ucp.  Returns
  * EFAULT when that copyin fails, EINVAL when the saved eflags or %cs
  * fail the checks below (a SIGBUS is also posted for a bad %cs), and
  * EJUSTRETURN once the trapframe, the signal mask and the alternate
  * stack settings have been taken from the context.
  */
 int
 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
 {
 	struct l_ucontext uc;
 	struct l_sigcontext *context;
 	sigset_t bmask;
 	l_stack_t *lss;
 	stack_t ss;
 	struct trapframe *regs;
 	int eflags;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 
 #ifdef DEBUG
 	if (ldebug(rt_sigreturn))
 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
 #endif
 	/*
 	 * The trampoline code hands us the ucontext.
 	 * It is unsafe to keep track of it ourselves, in the event that a
 	 * program jumps out of a signal handler.
 	 */
 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
 		return (EFAULT);
 
 	context = &uc.uc_mcontext;
 
 	/*
 	 * Check for security violations.
 	 * EFLAGS_SECURE is true when the new and the current flags differ
 	 * only in bits that are part of PSL_USERCHANGE.
 	 */
 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 	eflags = context->sc_eflags;
 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
 		return(EINVAL);
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
 	if (!CS_SECURE(context->sc_cs)) {
 		/* Report the rejected selector as a protection fault. */
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return(EINVAL);
 	}
 
 	/* Reinstate the signal mask saved in the ucontext. */
 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
 
 	/*
 	 * Restore signal context
 	 */
 	regs->tf_gs	= context->sc_gs;
 	regs->tf_fs	= context->sc_fs;
 	regs->tf_es	= context->sc_es;
 	regs->tf_ds	= context->sc_ds;
 	regs->tf_rdi    = context->sc_edi;
 	regs->tf_rsi    = context->sc_esi;
 	regs->tf_rbp    = context->sc_ebp;
 	regs->tf_rbx    = context->sc_ebx;
 	regs->tf_rdx    = context->sc_edx;
 	regs->tf_rcx    = context->sc_ecx;
 	regs->tf_rax    = context->sc_eax;
 	regs->tf_rip    = context->sc_eip;
 	regs->tf_cs     = context->sc_cs;
 	regs->tf_rflags = eflags;
 	regs->tf_rsp    = context->sc_esp_at_signal;
 	regs->tf_ss     = context->sc_ss;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 
 	/*
 	 * call sigaltstack & ignore results..
 	 */
 	lss = &uc.uc_stack;
 	ss.ss_sp = PTRIN(lss->ss_sp);
 	ss.ss_size = lss->ss_size;
 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
 
 #ifdef DEBUG
 	if (ldebug(rt_sigreturn))
 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
 #endif
 	(void)kern_sigaltstack(td, &ss, NULL);
 
 	return (EJUSTRETURN);
 }
 
 static int
 linux32_fetch_syscall_args(struct thread *td)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	struct syscall_args *sa;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 	sa = &td->td_sa;
 
 	sa->args[0] = frame->tf_rbx;
 	sa->args[1] = frame->tf_rcx;
 	sa->args[2] = frame->tf_rdx;
 	sa->args[3] = frame->tf_rsi;
 	sa->args[4] = frame->tf_rdi;
 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
 	sa->code = frame->tf_rax;
 
 	if (sa->code >= p->p_sysent->sv_size)
 		/* nosys */
 		sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1];
 	else
 		sa->callp = &p->p_sysent->sv_table[sa->code];
 	sa->narg = sa->callp->sy_narg;
 
 	td->td_retval[0] = 0;
 	td->td_retval[1] = frame->tf_rdx;
 
 	return (0);
 }
 
 /*
  * If a linux binary is exec'ing something, try this image activator
  * first.  We override standard shell script execution in order to
  * be able to modify the interpreter path.  We only do this if a linux
  * binary is doing the exec, so we do not create an EXEC module for it.
  */
 static int	exec_linux_imgact_try(struct image_params *iparams);
 
 static int
 exec_linux_imgact_try(struct image_params *imgp)
 {
 	const char *hdr;
 	char *altpath;
 	int rv;
 
 	/*
 	 * Only shell scripts are handled here: their interpreter needs
 	 * to be located in /compat/linux if possible in order to
 	 * recursively maintain linux path emulation.  Anything else is
 	 * declined with -1.
 	 */
 	hdr = (const char *)imgp->image_header;
 	if (((const short *)hdr)[0] != SHELLMAGIC)
 		return (-1);
 
 	/* Run our normal shell image activator first. */
 	rv = exec_shell_imgact(imgp);
 	if (rv != 0)
 		return (rv);
 
 	/*
 	 * It succeeded, so look for the interpreter under the alternate
 	 * path.  If one is found, use our stringspace to store it.
 	 */
 	linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
 	    imgp->interpreter_name, UIO_SYSSPACE, &altpath, 0, AT_FDCWD);
 	if (altpath != NULL)
 		imgp->args->fname_buf = imgp->interpreter_name = altpath;
 
 	return (0);
 }
 
 /*
  * Clear registers on exec
  * XXX copied from ia32_signal.c.
  *
  * Frees the process LDT if one is set, zeroes the FS/GS base MSRs and
  * their pcb copies, then rebuilds the trapframe from scratch: entry
  * point from imgp->entry_addr, stack pointer from 'stack', the 32-bit
  * user code selector in %cs and imgp->ps_strings in %rbx.
  */
 static void
 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 
 	/* Keep the MSR writes and the pcb copies together. */
 	critical_enter();
 	wrmsr(MSR_FSBASE, 0);
 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	critical_exit();
 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	regs->tf_rsp = stack;
 	/*
 	 * NOTE(review): the frame was zeroed just above, so
 	 * (regs->tf_rflags & PSL_T) is 0 here and the result is PSL_USER.
 	 */
 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
 	regs->tf_gs = _ugssel;
 	regs->tf_fs = _ufssel;
 	regs->tf_es = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_ss = _udatasel;
 	regs->tf_flags = TF_HASSEGS;
 	regs->tf_cs = _ucode32sel;
 	regs->tf_rbx = imgp->ps_strings;
 
 	fpstate_drop(td);
 
 	/* Do full restore on return so that we can change to a different %cs */
 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
-	td->td_retval[1] = 0;
 }
 
 /*
  * XXX copied from ia32_sysvec.c.
  *
  * Copy the exec arguments out to the new process stack.  Working down
  * from LINUX32_PS_STRINGS this places the executable path (when there is
  * one), a random canary, the argument/environment strings, and below
  * them the 32-bit argv/envp pointer vectors (with room reserved for the
  * ELF aux vector when imgp->auxargs is set).  The ps_strings structure
  * is filled in as well.
  *
  * Returns the address of the start of the pointer vector, which is also
  * the initial stack base.
  *
  * NOTE(review): the results of copyout() and suword32() are not checked
  * anywhere in this function.
  */
 static register_t *
 linux_copyout_strings(struct image_params *imgp)
 {
 	int argc, envc;
 	u_int32_t *vectp;
 	char *stringp, *destp;
 	u_int32_t *stack_base;
 	struct linux32_ps_strings *arginfo;
 	char canary[LINUX_AT_RANDOM_LEN];
 	size_t execpath_len;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 
 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
 	/* Each area is rounded up to a multiple of sizeof(char *). */
 	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
 	    roundup(sizeof(canary), sizeof(char *)) -
 	    roundup(execpath_len, sizeof(char *)) -
 	    roundup(ARG_MAX - imgp->args->stringspace, sizeof(char *));
 
 	if (execpath_len != 0) {
 		/* The path sits directly below the ps_strings structure. */
 		imgp->execpathp = (uintptr_t)arginfo - execpath_len;
 		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	imgp->canary = (uintptr_t)arginfo -
 	    roundup(execpath_len, sizeof(char *)) -
 	    roundup(sizeof(canary), sizeof(char *));
 	copyout(canary, (void *)imgp->canary, sizeof(canary));
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (LINUX_AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets,and imgp->auxarg_size is room
 		 * for argument of Runtime loader.
 		 */
 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size) *
 		    sizeof(u_int32_t));
 
 	} else
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2) * sizeof(u_int32_t));
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 * stringp walks the kernel copy of the strings while destp tracks
 	 * the matching user address, one NUL-terminated string per slot.
 	 */
 	for (; argc > 0; --argc) {
 		suword32(vectp++, (uint32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword32(vectp++, 0);
 
 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword32(vectp++, (uint32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword32(vectp, 0);
 
 	return ((register_t *)stack_base);
 }
 
 /* Sysctl subtree for the knobs declared below. */
 static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
     "32-bit Linux emulation");
 
 /*
  * Ceilings applied by linux32_fixlimit() to the data, stack and vmem
  * resource limits; a value of 0 leaves the corresponding limit alone.
  */
 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
     &linux32_maxdsiz, 0, "");
 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
     &linux32_maxssiz, 0, "");
 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
     &linux32_maxvmem, 0, "");
 
 #if defined(DEBUG)
 /* String-valued knob handled by linux_sysctl_debug(); DEBUG kernels only. */
 SYSCTL_PROC(_compat_linux32, OID_AUTO, debug,
             CTLTYPE_STRING | CTLFLAG_RW,
             0, 0, linux_sysctl_debug, "A",
             "Linux debugging control");
 #endif
 
 static void
 linux32_fixlimit(struct rlimit *rl, int which)
 {
 
 	switch (which) {
 	case RLIMIT_DATA:
 		if (linux32_maxdsiz != 0) {
 			if (rl->rlim_cur > linux32_maxdsiz)
 				rl->rlim_cur = linux32_maxdsiz;
 			if (rl->rlim_max > linux32_maxdsiz)
 				rl->rlim_max = linux32_maxdsiz;
 		}
 		break;
 	case RLIMIT_STACK:
 		if (linux32_maxssiz != 0) {
 			if (rl->rlim_cur > linux32_maxssiz)
 				rl->rlim_cur = linux32_maxssiz;
 			if (rl->rlim_max > linux32_maxssiz)
 				rl->rlim_max = linux32_maxssiz;
 		}
 		break;
 	case RLIMIT_VMEM:
 		if (linux32_maxvmem != 0) {
 			if (rl->rlim_cur > linux32_maxvmem)
 				rl->rlim_cur = linux32_maxvmem;
 			if (rl->rlim_max > linux32_maxvmem)
 				rl->rlim_max = linux32_maxvmem;
 		}
 		break;
 	}
 }
 
 /*
  * System entry vector for 32-bit Linux ELF binaries.  It ties together
  * the syscall table, the errno and trap translation hooks, the signal
  * delivery code, the exec-time helpers and the address-space layout
  * constants defined for the linux32 ABI.
  */
 struct sysentvec elf_linux_sysvec = {
 	.sv_size	= LINUX32_SYS_MAXSYSCALL,
 	.sv_table	= linux32_sysent,
 	.sv_mask	= 0,
 	.sv_errsize	= ELAST + 1,
 	.sv_errtbl	= bsd_to_linux_errno,
 	.sv_transtrap	= translate_traps,
 	.sv_fixup	= elf_linux_fixup,
 	.sv_sendsig	= linux_sendsig,
 	/* Signal trampoline blob; its size is set by linux_vdso_install(). */
 	.sv_sigcode	= &_binary_linux32_locore_o_start,
 	.sv_szsigcode	= &linux_szsigcode,
 	.sv_name	= "Linux ELF32",
 	.sv_coredump	= elf32_coredump,
 	.sv_imgact_try	= exec_linux_imgact_try,
 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
 	.sv_pagesize	= PAGE_SIZE,
 	.sv_minuser	= VM_MIN_ADDRESS,
 	.sv_maxuser	= LINUX32_MAXUSER,
 	.sv_usrstack	= LINUX32_USRSTACK,
 	.sv_psstrings	= LINUX32_PS_STRINGS,
 	.sv_stackprot	= VM_PROT_ALL,
 	.sv_copyout_strings = linux_copyout_strings,
 	.sv_setregs	= exec_linux_setregs,
 	.sv_fixlimit	= linux32_fixlimit,
 	.sv_maxssiz	= &linux32_maxssiz,
 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
 	.sv_set_syscall_retval = cpu_set_syscall_retval,
 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
 	.sv_syscallnames = NULL,
 	/* One-page shared page; its object is set by linux_vdso_install(). */
 	.sv_shared_page_base = LINUX32_SHAREDPAGE,
 	.sv_shared_page_len = PAGE_SIZE,
 	.sv_schedtail	= linux_schedtail,
 	.sv_thread_detach = linux_thread_detach,
 	.sv_trap	= NULL,	
 };
 
 /*
  * SYSINIT hook that sets up the Linux shared page: size the sigcode
  * blob, create the shared page object, relocate the vdso, copy the blob
  * into the page and record the object in elf_linux_sysvec.
  */
 static void
 linux_vdso_install(void *param)
 {
 
 	/* The blob size is the distance between its start and end symbols. */
 	linux_szsigcode = (&_binary_linux32_locore_o_end - 
 	    &_binary_linux32_locore_o_start);
 
 	/* The blob has to fit in the shared page. */
 	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
 		panic("Linux invalid vdso size\n");
 
 	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
 
 	linux_shared_page_obj = __elfN(linux_shared_page_init)
 	    (&linux_shared_page_mapping);
 
 	__elfN(linux_vdso_reloc)(&elf_linux_sysvec);
 
 	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
 	    linux_szsigcode);
 	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
 
 	/*
 	 * Kernel address of the platform string: its offset from the
 	 * shared page base, applied to the kernel mapping of the page.
 	 */
 	linux_kplatform = linux_shared_page_mapping +
 	    (linux_platform - (caddr_t)elf_linux_sysvec.sv_shared_page_base);
 }
 SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
     (sysinit_cfunc_t)linux_vdso_install, NULL);
 
 /*
  * SYSUNINIT hook: hand the shared page object created by
  * linux_vdso_install() to the shared page teardown routine.
  */
 static void
 linux_vdso_deinstall(void *param)
 {
 
 	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
 }
 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
     (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
 
 static char GNU_ABI_VENDOR[] = "GNU";
 static int GNULINUX_ABI_DESC = 0;
 
 /*
  * Translate the descriptor of a GNU ABI note into an osrel value.
  *
  * The descriptor starts right after the note header and the name, the
  * latter padded to a multiple of sizeof(Elf32_Addr).  Returns FALSE when
  * its first word is not GNULINUX_ABI_DESC; otherwise stores the encoded
  * release in *osrel and returns TRUE.
  */
 static boolean_t
 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
 {
 	const Elf32_Word *words;
 	uintptr_t descaddr;
 
 	descaddr = (uintptr_t)(note + 1);
 	descaddr += roundup2(note->n_namesz, sizeof(Elf32_Addr));
 	words = (const Elf32_Word *)descaddr;
 
 	if (words[0] == GNULINUX_ABI_DESC) {
 		/*
 		 * For linux we encode osrel as VVVMMMIII
 		 * (version, major, minor), see linux_mib.c.
 		 */
 		*osrel = words[1] * 1000000 + words[2] * 1000 + words[3];
 		return (TRUE);
 	}
 
 	return (FALSE);
 }
 
 /*
  * Brand note description shared by all linux32 brands: a note with
  * vendor name "GNU" and type 1, whose descriptor is translated to an
  * osrel value by linux32_trans_osrel().
  */
 static Elf_Brandnote linux32_brandnote = {
 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
 	.hdr.n_type	= 1,
 	.vendor		= GNU_ABI_VENDOR,
 	.flags		= BN_TRANSLATE_OSREL,
 	.trans_osrel	= linux32_trans_osrel
 };
 
 /* Brand entry for i386 Linux binaries that use /lib/ld-linux.so.1. */
 static Elf32_Brandinfo linux_brand = {
 	.brand		= ELFOSABI_LINUX,
 	.machine	= EM_386,
 	.compat_3_brand	= "Linux",
 	.emul_path	= "/compat/linux",
 	.interp_path	= "/lib/ld-linux.so.1",
 	.sysvec		= &elf_linux_sysvec,
 	.interp_newpath	= NULL,
 	.brand_note	= &linux32_brandnote,
 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
 };
 
 /* Brand entry for i386 Linux binaries that use /lib/ld-linux.so.2. */
 static Elf32_Brandinfo linux_glibc2brand = {
 	.brand		= ELFOSABI_LINUX,
 	.machine	= EM_386,
 	.compat_3_brand	= "Linux",
 	.emul_path	= "/compat/linux",
 	.interp_path	= "/lib/ld-linux.so.2",
 	.sysvec		= &elf_linux_sysvec,
 	.interp_newpath	= NULL,
 	.brand_note	= &linux32_brandnote,
 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
 };
 
 /* Brand entry for i386 Linux binaries that use /lib/ld-musl-i386.so.1. */
 static Elf32_Brandinfo linux_muslbrand = {
 	.brand		= ELFOSABI_LINUX,
 	.machine	= EM_386,
 	.compat_3_brand	= "Linux",
 	.emul_path	= "/compat/linux",
 	.interp_path	= "/lib/ld-musl-i386.so.1",
 	.sysvec		= &elf_linux_sysvec,
 	.interp_newpath	= NULL,
 	.brand_note	= &linux32_brandnote,
 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
 };
 
 /*
  * NULL-terminated list of the brands that linux_elf_modevent() inserts
  * on load and removes on unload.
  */
 Elf32_Brandinfo *linux_brandlist[] = {
 	&linux_brand,
 	&linux_glibc2brand,
 	&linux_muslbrand,
 	NULL
 };
 
 /*
  * Module event handler.
  *
  * MOD_LOAD inserts every brand in linux_brandlist and, if all of them
  * were accepted, registers the ioctl handlers and sets up the futex
  * list and mutex.  MOD_UNLOAD refuses with EBUSY while any brand is in
  * use, otherwise removes the brands and undoes the rest.  Any other
  * event type yields EOPNOTSUPP.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 linux_elf_modevent(module_t mod, int type, void *data)
 {
 	Elf32_Brandinfo **brandinfo;
 	int error;
 	struct linux_ioctl_handler **lihp;
 
 	error = 0;
 
 	switch(type) {
 	case MOD_LOAD:
 		/*
 		 * NOTE(review): a failed insert does not stop the loop and
 		 * brands inserted before or after it are not removed again.
 		 */
 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
 		     ++brandinfo)
 			if (elf32_insert_brand_entry(*brandinfo) < 0)
 				error = EINVAL;
 		if (error == 0) {
 			SET_FOREACH(lihp, linux_ioctl_handler_set)
 				linux_ioctl_register_handler(*lihp);
 			LIST_INIT(&futex_list);
 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
 			/* Use stathz when it is set, hz otherwise. */
 			stclohz = (stathz ? stathz : hz);
 			if (bootverbose)
 				printf("Linux ELF exec handler installed\n");
 		} else
 			printf("cannot insert Linux ELF brand handler\n");
 		break;
 	case MOD_UNLOAD:
 		/* First pass: make sure no brand is still in use. */
 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
 		     ++brandinfo)
 			if (elf32_brand_inuse(*brandinfo))
 				error = EBUSY;
 		if (error == 0) {
 			for (brandinfo = &linux_brandlist[0];
 			     *brandinfo != NULL; ++brandinfo)
 				if (elf32_remove_brand_entry(*brandinfo) < 0)
 					error = EINVAL;
 		}
 		if (error == 0) {
 			SET_FOREACH(lihp, linux_ioctl_handler_set)
 				linux_ioctl_unregister_handler(*lihp);
 			mtx_destroy(&futex_mtx);
 			if (bootverbose)
 				printf("Linux ELF exec handler removed\n");
 		} else
 			printf("Could not deinstall ELF interpreter entry\n");
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (error);
 }
 
 /* Module glue: name, event handler, and no private data. */
 static moduledata_t linux_elf_mod = {
 	"linuxelf",
 	linux_elf_modevent,
 	0
 };
 
 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
 /* Requires version 1 of the linux_common module. */
 MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1);
 FEATURE(linux, "Linux 32bit support");
Index: projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_hw.c
===================================================================
--- projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_hw.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_hw.c	(revision 326162)
@@ -1,1509 +1,1507 @@
 /*-
  * Copyright (c) 2016, Anish Gupta (anish@freebsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/rman.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include <machine/resource.h>
 #include <machine/vmm.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pci_cfgreg.h>
 
 #include "pcib_if.h"
 
 #include "io/iommu.h"
 #include "amdvi_priv.h"
 
 /* Sysctl subtree for the AMD-Vi knobs declared in this file. */
 SYSCTL_DECL(_hw_vmm);
 SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Advance or rewind the byte offset 'a' by one entry of size 's' in a
  * ring of 'm' entries, wrapping at m * s bytes.
  * NOTE(review): MOD_DEC relies on (a) - (s) being evaluated in an
  * unsigned type when a is 0; with signed operands the result of % would
  * be negative.  Confirm the operand types at each use.
  */
 #define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s)))
 #define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s)))
 
 /* Print RID or device ID in PCI string format. */
 #define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d)
 
 /* Forward declarations for helpers used before their definitions. */
 static void amdvi_dump_cmds(struct amdvi_softc *softc);
 static void amdvi_print_dev_cap(struct amdvi_softc *softc);
 
 MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi");
 
 /* Defined outside this file. */
 extern device_t *ivhd_devs;
 
 extern int ivhd_count;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count,
     0, NULL);
 
 /*
  * NOTE(review): the tunable string below ("hw.vmm.amdvi_enable") does
  * not follow the hw.vmm.amdvi.<name> pattern used by the other tunables
  * in this file; confirm whether that is intentional.
  */
 static int amdvi_enable_user = 0;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN,
     &amdvi_enable_user, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user);
 
 #ifdef AMDVI_ATS_ENABLE
 /* XXX: ATS is not tested. */
 /* NOTE(review): tunable string is "hw.vmm.enable_iotlb", same remark. */
 static int amdvi_enable_iotlb = 1;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN,
     &amdvi_enable_iotlb, 0, NULL);
 TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb);
 #endif
 
 static int amdvi_host_ptp = 1;	/* Use page tables for host. */
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN,
     &amdvi_host_ptp, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp);
 
 /* Page table level used <= supported by h/w[v1=7]. */
 static int amdvi_ptp_level = 4;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN,
     &amdvi_ptp_level, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level);
 
 /* Disable fault event reporting. */
 static int amdvi_disable_io_fault = 0;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN,
     &amdvi_disable_io_fault, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault);
 
 static uint32_t amdvi_dom_id = 0;	/* 0 is reserved for host. */
 SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD,
     &amdvi_dom_id, 0, NULL);
 /*
  * Device table entry.
  * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes).
  *	= 256 * 2 * PAGE_SIZE.
  */
 static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE);
 /* Compile-time checks of the sizes quoted above (64K entries, 2 MB). */
 CTASSERT(PCI_NUM_DEV_MAX == 0x10000);
 CTASSERT(sizeof(amdvi_dte) == 0x200000);
 
 /* Head of the list of amdvi_domain structures. */
 static SLIST_HEAD (, amdvi_domain) dom_head;
 
 /*
  * Write the 4-byte value 'data' at offset 'off' of the PCI configuration
  * space of the function identified by softc->pci_rid.
  */
 static inline void
 amdvi_pci_write(struct amdvi_softc *softc, int off, uint32_t data)
 {
 	int bus, slot, func;
 
 	bus = PCI_RID2BUS(softc->pci_rid);
 	slot = PCI_RID2SLOT(softc->pci_rid);
 	func = PCI_RID2FUNC(softc->pci_rid);
 
 	pci_cfgregwrite(bus, slot, func, off, data, 4);
 }
 
 /*
  * Read 4 bytes at offset 'off' of the PCI configuration space of the
  * function identified by softc->pci_rid and return them.
  */
 static inline uint32_t
 amdvi_pci_read(struct amdvi_softc *softc, int off)
 {
 	int bus, slot, func;
 
 	bus = PCI_RID2BUS(softc->pci_rid);
 	slot = PCI_RID2SLOT(softc->pci_rid);
 	func = PCI_RID2FUNC(softc->pci_rid);
 
 	return (pci_cfgregread(bus, slot, func, off, 4));
 }
 
 /*
  * Walk the PCI capability list of the IOMMU function looking for the
  * capability ID 'capability'.
  *
  * Returns 0 and stores the capability offset in *off when it is found,
  * ENXIO when the status register reports no capability list, and ENOENT
  * when the list ends without a match.
  */
 static int
 amdvi_find_pci_cap(struct amdvi_softc *softc, uint8_t capability, int *off)
 {
 	uint32_t reg;
 	uint8_t pos;
 
 	/* The status register is the upper half of the command dword. */
 	reg = amdvi_pci_read(softc, PCIR_COMMAND);
 	if (((reg >> 16) & PCIM_STATUS_CAPPRESENT) == 0)
 		return (ENXIO);
 
 	/*
 	 * Start at the capability pointer; in each entry the low byte is
 	 * the ID and the next byte is the offset of the following entry.
 	 */
 	for (pos = amdvi_pci_read(softc, PCIR_CAP_PTR) & 0xFF; pos != 0;
 	    pos = (reg >> 8) & 0xFF) {
 		reg = amdvi_pci_read(softc, pos);
 		if ((reg & 0xFF) == capability) {
 			*off = pos;
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 
 #ifdef AMDVI_ATS_ENABLE
 /* XXX: Should be in pci.c */
 /*
  * Check if device has ATS capability and its enabled.
  * If ATS is absent or disabled, return (-1), otherwise ATS
  * queue length.
  */
 static int
 amdvi_find_ats_qlen(uint16_t devid)
 {
 	device_t dev;
 	/*
 	 * NOTE(review): 'off' is a uint32_t whose address is handed to
 	 * pci_find_extcap(); confirm that matches the pointer type the
 	 * function expects.
 	 */
 	uint32_t off, cap;
 	int qlen = -1;
 
 	dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid),
 			   PCI_RID2FUNC(devid));
 
 	if (!dev) {
 		return (-1);
 	}
 #define PCIM_ATS_EN	BIT(31)
 
 	if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) {
 		/*
 		 * The dword at off + 4 carries the queue length in its low
 		 * five bits (0 is treated as 32) and the enable bit in
 		 * bit 31.
 		 */
 		cap = pci_read_config(dev, off + 4, 4);
 		qlen = (cap & 0x1F);
 		qlen = qlen ? qlen : 32;
 		printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n",
 		       RID2PCI_STR(devid),
 		       (cap & PCIM_ATS_EN) ? "enabled" : "Disabled",
 		       qlen);
 		/* Report a length only when ATS is actually enabled. */
 		qlen = (cap & PCIM_ATS_EN) ? qlen : -1;
 	}
 
 	return (qlen);
 }
 
 /*
  * Check if an endpoint device support device IOTLB or ATS.
  *
  * Returns false when amdvi_find_ats_qlen() reports no enabled ATS for
  * 'devid'.  Otherwise looks up the device configuration range covering
  * 'devid', logs a message when its enable_ats setting disagrees with
  * the PCI state, and returns the PCI state.
  */
 static inline bool
 amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid)
 {
 	struct ivhd_dev_cfg *cfg;
 	int qlen, i;
 	bool pci_ats, ivhd_ats;
 
 	qlen = amdvi_find_ats_qlen(devid);
 	if (qlen < 0)
 		return (false);
 
 	KASSERT(softc, ("softc is NULL"));
 	cfg = softc->dev_cfg;
 
 	/* Find the first configured range that contains devid. */
 	ivhd_ats = false;
 	for (i = 0; i < softc->dev_cfg_cnt; i++) {
 		if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) {
 			ivhd_ats = cfg->enable_ats;
 			break;
 		}
 		cfg++;
 	}
 
 	/*
 	 * NOTE(review): qlen < 0 already returned above, so pci_ats is
 	 * always true here.  The two string pieces below are also joined
 	 * without a space after the comma.
 	 */
 	pci_ats = (qlen < 0) ? false : true;
 	if (pci_ats != ivhd_ats)
 		device_printf(softc->dev,
 		    "BIOS bug: mismatch in ATS setting for %d.%d.%d,"
 		    "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen);
 
 	/* Ignore IVRS setting and respect PCI setting. */
 	return (pci_ats);
 }
 #endif
 
 /*
  * Enable IOTLB support for IOMMU if its supported.
  *
  * Without AMDVI_ATS_ENABLE this always records softc->iotlb = false.
  * With it, IOTLB is recorded as usable only when the PCI capability bit,
  * the IVHD flag and the amdvi_enable_iotlb setting all allow it.
  */
 static inline void
 amdvi_hw_enable_iotlb(struct amdvi_softc *softc)
 {
 #ifndef AMDVI_ATS_ENABLE
 	softc->iotlb = false;
 #else
 	bool supported;
 
 	/* What the IVHD flags say. */
 	supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false;
 
 	if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) {
 		if (!supported)
 			device_printf(softc->dev, "IOTLB disabled by BIOS.\n");
 
 		if (supported && !amdvi_enable_iotlb) {
 			device_printf(softc->dev, "IOTLB disabled by user.\n");
 			supported = false;
 		}
 	} else
 		/* The PCI capability does not advertise IOTLB at all. */
 		supported = false;
 
 	softc->iotlb = supported;
 
 #endif
 }
 
 /*
  * Allocate the zeroed command buffer and describe it in the control
  * structure: length exponent, base page number and cleared head/tail
  * pointers.  Panics when the buffer is not page aligned; otherwise
  * returns 0.
  */
 static int
 amdvi_init_cmd(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl = softc->ctrl;
 
 	ctrl->cmd.len = 8;	/* Use 256 command buffer entries. */
 	/* The entry count is 2 to the power of the length field. */
 	softc->cmd_max = 1 << ctrl->cmd.len;
 
 	softc->cmd = malloc(sizeof(struct amdvi_cmd) *
 	    softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO);
 
 	if ((uintptr_t)softc->cmd & PAGE_MASK)
 		panic("AMDVi: Command buffer not aligned on page boundary.");
 
 	/* The base field holds the physical page number of the buffer. */
 	ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE;
 	/*
 	 * XXX: Reset the h/w pointers in case IOMMU is restarting,
 	 * h/w doesn't clear these pointers based on empirical data.
 	 */
 	ctrl->cmd_tail = 0;
 	ctrl->cmd_head = 0;
 
 	return (0);
 }
 
 /*
  * Note: Update tail pointer after we have written the command since tail
  * pointer update cause h/w to execute new commands, see section 3.3
  * of AMD IOMMU spec ver 2.0.
  */
 /* Get the command tail pointer w/o updating it. */
 static struct amdvi_cmd *
 amdvi_get_cmd_tail(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_cmd *tail;
 
 	KASSERT(softc, ("softc is NULL"));
 	KASSERT(softc->cmd != NULL, ("cmd is NULL"));
 
 	ctrl = softc->ctrl;
 	KASSERT(ctrl != NULL, ("ctrl is NULL"));
 
 	tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd +
 	    ctrl->cmd_tail);
 
 	return (tail);
 }
 
 /*
  * Update the command tail pointer which will start command execution.
  *
  * Advances ctrl->cmd_tail by the size of one command, wrapping at the
  * end of the command buffer (softc->cmd_max entries), and increments
  * the softc->total_cmd counter.
  */
 static void
 amdvi_update_cmd_tail(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	int size;
 
 	size = sizeof(struct amdvi_cmd);
 	KASSERT(softc->cmd != NULL, ("cmd is NULL"));
 
 	ctrl = softc->ctrl;
 	KASSERT(ctrl != NULL, ("ctrl is NULL"));
 
 	ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max);
 	softc->total_cmd++;
 
 #ifdef AMDVI_DEBUG_CMD
 	/*
 	 * One conversion per argument: a stray %s here would have consumed
 	 * the integer cmd_tail as a string pointer.
 	 */
 	device_printf(softc->dev, "cmd_tail: Tail:0x%x, Head:0x%x.\n",
 	    ctrl->cmd_tail,
 	    ctrl->cmd_head);
 #endif
 
 }
 
 /*
  * Various commands supported by IOMMU.
  */
 
 /*
  * Completion wait command.
  *
  * Queues a command that carries the physical address of
  * softc->cmp_data and the value 'data', then advances the tail.
  * amdvi_cmp_wait() polls cmp_data for that value afterwards.
  */
 static void
 amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data)
 {
 	struct amdvi_cmd *cmd;
 	uint64_t pa;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 
 	pa = vtophys(&softc->cmp_data);
 	cmd->opcode = AMDVI_CMP_WAIT_OPCODE;
 	/* Low 32 address bits (8-byte aligned) combined with the store flag. */
 	cmd->word0 = (pa & 0xFFFFFFF8) |
 	    (AMDVI_CMP_WAIT_STORE);
 	/* Alternative kept for reference: also request a flush. */
 	//(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE);
 	/* Address bits 32 and up, limited to 20 bits. */
 	cmd->word1 = (pa >> 32) & 0xFFFFF;
 	cmd->addr = data;
 
 	amdvi_update_cmd_tail(softc);
 }
 
 /*
  * Invalidate device table entry: queue an AMDVI_INVD_DTE_OPCODE command
  * for 'devid' and advance the command tail.
  */
 static void
 amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid)
 {
 	struct amdvi_cmd *slot = amdvi_get_cmd_tail(softc);
 
 	KASSERT(slot != NULL, ("Cmd is NULL"));
 
 	/* Fill in the slot before the tail is moved past it. */
 	slot->opcode = AMDVI_INVD_DTE_OPCODE;
 	slot->word0 = devid;
 
 	amdvi_update_cmd_tail(softc);
 
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid);
 #endif
 }
 
 /*
  * Invalidate IOMMU page, use for invalidation of domain.
  *
  * Queues an AMDVI_INVD_PAGE_OPCODE command for 'domain_id' with the
  * address 'addr'; 'pde' and 'page' select the AMDVI_INVD_PAGE_PDE and
  * AMDVI_INVD_PAGE_S flags that are ORed into the address field.
  * NOTE(review): 'guest_nested' is accepted but not used here.
  */
 static void
 amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id,
 			  uint64_t addr, bool guest_nested,
 			  bool pde, bool page)
 {
 	struct amdvi_cmd *cmd;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 
 
 	cmd->opcode = AMDVI_INVD_PAGE_OPCODE;
 	cmd->word1 = domain_id;
 	/*
 	 * Target address as passed by the caller; it covers the whole
 	 * domain only when the caller passes an all-addresses value.
 	 */
 	cmd->addr = addr;
 	cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0;
 	cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0;
 
 	amdvi_update_cmd_tail(softc);
 }
 
 #ifdef AMDVI_ATS_ENABLE
 /*
  * Invalidate device IOTLB.
  *
  * Does nothing when softc->iotlb is false.  Otherwise queues an
  * AMDVI_INVD_IOTLB_OPCODE command for 'devid' covering all addresses,
  * with the queue length reported by amdvi_find_ats_qlen().  Panics when
  * that function reports no enabled ATS for the device.
  */
 static void
 amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid)
 {
 	struct amdvi_cmd *cmd;
 	int qlen;
 
 	if (!softc->iotlb)
 		return;
 
 	qlen = amdvi_find_ats_qlen(devid);
 	if (qlen < 0) {
 		panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n",
 		      qlen, RID2PCI_STR(devid));
 	}
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidate IOTLB devID 0x%x"
 		      " Qlen:%d\n", devid, qlen);
 #endif
 	cmd->opcode = AMDVI_INVD_IOTLB_OPCODE;
 	cmd->word0 = devid;
 	cmd->word1 = qlen;
 	cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR |
 		AMDVI_INVD_IOTLB_S;
 	amdvi_update_cmd_tail(softc);
 }
 #endif
 
 #ifdef notyet				/* For Interrupt Remap. */
 /*
  * Queue an AMDVI_INVD_INTR_OPCODE command for 'devid' and advance the
  * command tail.  Only compiled when 'notyet' is defined.
  */
 static void
 amdvi_cmd_inv_intr_map(struct amdvi_softc *softc,
 		       uint16_t devid)
 {
 	struct amdvi_cmd *cmd;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 	cmd->opcode = AMDVI_INVD_INTR_OPCODE;
 	cmd->word0 = devid;
 	amdvi_update_cmd_tail(softc);
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid);
 #endif
 }
 #endif
 
 /*
  * Invalidate domain using INVALIDATE_IOMMU_PAGES command.
  *
  * Queues a single page-invalidation command for 'domain_id' that covers
  * all addresses, with both the PDE and size flags requested.
  */
 static void
 amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id)
 {
 
 	/*
 	 * See section 3.3.3 of IOMMU spec rev 2.0, software note
 	 * for invalidating domain.
 	 *
 	 * The command slot is fetched and asserted inside
 	 * amdvi_cmd_inv_iommu_pages(), so no lookup is needed here.
 	 */
 	amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR,
 				false, true, true);
 
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id);
 
 #endif
 }
 
 /*
  * Queue a completion-wait command and poll for its result.
  *
  * Clears softc->cmp_data, queues a command that asks for the marker
  * value to be stored there, then polls up to 100 times with a 1 ms
  * delay between polls.  Returns true when the marker was observed.
  */
 static	bool
 amdvi_cmp_wait(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	const uint64_t VERIFY = 0xA5A5;
 	volatile uint64_t *read;
 	int i;
 	bool status;
 
 	ctrl = softc->ctrl;
 	/* Poll through a volatile pointer so every check reloads memory. */
 	read = &softc->cmp_data;
 	*read = 0;
 	amdvi_cmd_cmp(softc, VERIFY);
 	/* Wait for h/w to update completion data. */
 	for (i = 0; i < 100 && (*read != VERIFY); i++) {
 		DELAY(1000);		/* 1 ms */
 	}
 	/* Take the final reading through the same volatile pointer. */
 	status = (*read == VERIFY);
 
 #ifdef AMDVI_DEBUG_CMD
 	/* 'i' is the number of polls performed. */
 	if (status)
 		device_printf(softc->dev, "CMD completion DONE Tail:0x%x, "
 			      "Head:0x%x, loop:%d.\n", ctrl->cmd_tail,
 			      ctrl->cmd_head, i);
 #endif
 	return (status);
 }
 
 /*
  * Wait for the queued commands to complete.
  *
  * Returns immediately when AMDVI_CTRL_EN is not set in the control
  * register.  Otherwise calls amdvi_cmp_wait() up to ten times; if none
  * of the attempts succeeds, logs the tail/head pointers and dumps the
  * command buffer.
  */
 static void
 amdvi_wait(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *regs;
 	int tries;
 
 	KASSERT(softc, ("softc is NULL"));
 
 	regs = softc->ctrl;
 	KASSERT(regs != NULL, ("ctrl is NULL"));
 
 	/* Don't wait if h/w is not enabled. */
 	if ((regs->control & AMDVI_CTRL_EN) == 0)
 		return;
 
 	for (tries = 10; tries > 0; tries--) {
 		if (amdvi_cmp_wait(softc))
 			return;
 	}
 
 	/* Every attempt timed out. */
 	device_printf(softc->dev, "Error: completion failed"
 		      " tail:0x%x, head:0x%x.\n",
 		      regs->cmd_tail, regs->cmd_head);
 	amdvi_dump_cmds(softc);
 }
 
 /*
  * Print the commands in the command buffer, starting one entry before
  * the head offset and stopping at the tail offset (or after cmd_max
  * entries, whichever comes first).
  */
 static void
 amdvi_dump_cmds(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_cmd *entry;
 	int off, i;
 
 	ctrl = softc->ctrl;
 	device_printf(softc->dev, "Dump all the commands:\n");
 
 	/*
 	 * A stuck completion points at the previous command, so the dump
 	 * begins one slot behind the head.
 	 */
 	off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd),
 	    softc->cmd_max);
 
 	i = 0;
 	while (off != ctrl->cmd_tail && i < softc->cmd_max) {
 		entry = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off);
 		printf("  [CMD%d, off:0x%x] opcode= 0x%x 0x%x"
 		    " 0x%x 0x%lx\n", i, off, entry->opcode,
 		    entry->word0, entry->word1, entry->addr);
 
 		/* Byte offset of the next slot, wrapping at the buffer end. */
 		off = (off + sizeof(struct amdvi_cmd)) %
 		    (softc->cmd_max * sizeof(struct amdvi_cmd));
 		i++;
 	}
 }
 
 /*
  * Allocate the event log and program its base and length into the
  * control registers, then reset the event head/tail pointers.
  *
  * The log holds 1 << 8 entries.  Its base is programmed as a page frame
  * number, so the buffer has to be page aligned.
  *
  * Returns 0 on success, or ENOMEM when the allocated buffer is not page
  * aligned.  In that case the buffer stays in softc->event so that
  * amdvi_teardown_hw() can release it.
  */
 static int
 amdvi_init_event(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 
 	ctrl = softc->ctrl;
 	ctrl->event.len = 8;
 	softc->event_max = 1 << ctrl->event.len;
 	softc->event = malloc(sizeof(struct amdvi_event) *
 	    softc->event_max, M_AMDVI, M_WAITOK | M_ZERO);
 	if ((uintptr_t)softc->event & PAGE_MASK) {
 		device_printf(softc->dev,
 		    "Event buffer not aligned on page.\n");
 		/* Must be non-zero: the caller treats 0 as success. */
 		return (ENOMEM);
 	}
 	ctrl->event.base = vtophys(softc->event) / PAGE_SIZE;
 
 	/* Reset the pointers. */
 	ctrl->evt_head = 0;
 	ctrl->evt_tail = 0;
 
 	return (0);
 }
 
 /*
  * Print the event flag bits selected by AMDVI_EVENT_FLAG_MASK using the
  * kernel "%b" bit-name format, followed by the "]" that closes the event
  * line started by the caller.
  */
 static inline void
 amdvi_decode_evt_flag(uint16_t flag)
 {
 
 	flag &= AMDVI_EVENT_FLAG_MASK;
-	printf("0x%b]\n", flag,
+	printf(" 0x%b]\n", flag,
 		"\020"
 		"\001GN"
 		"\002NX"
 		"\003US"
 		"\004I"
 		"\005PR"
 		"\006RW"
 		"\007PE"
 		"\010RZ"
 		"\011TR"
 		);
 }
 
 /*
  * Print the name of the error type encoded in an event's flag word.
  * See section 2.5.4 of AMD IOMMU spec ver 2.62.
  *
  * 'type' is the raw 16-bit event flag; AMDVI_EVENT_FLAG_TYPE() extracts
  * the type field from it.  The parameter is 16 bits wide because the
  * type field sits above bit 8 of the flag (amdvi_decode_evt() reads it
  * with "flag >> 9"), so an 8-bit parameter would drop it before the
  * macro is applied.
  */
 static inline void
 amdvi_decode_evt_flag_type(uint16_t type)
 {
 
 	switch (AMDVI_EVENT_FLAG_TYPE(type)) {
 	case 0:
 		printf("RSVD\n");
 		break;
 	case 1:
 		printf("Master Abort\n");
 		break;
 	case 2:
 		printf("Target Abort\n");
 		break;
 	case 3:
 		printf("Data Err\n");
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Print an AMDVI_EVENT_INVALID_DTE event: device ID, domain ID and
  * address, followed by the decoded flag bits.
  */
 static void
 amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr,
     uint16_t flag)
 {
 
 	/* Label this as an invalid-DTE event, not as an I/O page fault. */
 	printf("\t[INVALID_DTE EVT: devId:0x%x DomId:0x%x"
 	    " Addr:0x%lx",
 	    devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 }
 
 /*
  * Print an I/O page fault event: device ID, domain ID and faulting
  * address, followed by the decoded flag bits.
  */
 static void
 amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr,
     uint16_t flag)
 {
 
 	printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x Addr:0x%lx",
 	    devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 }
 
 /*
  * Print a device-table hardware error event: identifiers and address,
  * then the decoded flag bits and the error type.
  */
 static void
 amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid,
     uint64_t addr, uint16_t flag)
 {
 
 	printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x Addr:0x%lx",
 	    devid, domid, addr);
 
 	/* Both decoders receive the raw flag word. */
 	amdvi_decode_evt_flag(flag);
 	amdvi_decode_evt_flag_type(flag);
 }
 
 /*
  * Print a page-table hardware error event: identifiers and address,
  * then the decoded flag bits and the error type.
  */
 static void
 amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr,
     uint16_t flag)
 {
 
 	printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
 	    " Addr:0x%lx", devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 	/*
 	 * Hand over the raw flag: amdvi_decode_evt_flag_type() applies
 	 * AMDVI_EVENT_FLAG_TYPE() itself, so extracting the field here as
 	 * well would apply the macro twice.
 	 */
 	amdvi_decode_evt_flag_type(flag);
 }
 
 /*
  * Print a human-readable description of one event log entry, selecting
  * the decoder from evt->opcode.  Opcodes without a case are reported as
  * unsupported.
  */
 static void
 amdvi_decode_evt(struct amdvi_event *evt)
 {
 	struct amdvi_cmd *cmd;
 
 	switch (evt->opcode) {
 	case AMDVI_EVENT_INVALID_DTE:
 		amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_PFAULT:
 		amdvi_decode_pf_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_DTE_HW_ERROR:
 		amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_PAGE_HW_ERROR:
 		amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_ILLEGAL_CMD:
 		/* FALL THROUGH */
 	case AMDVI_EVENT_CMD_HW_ERROR:
-		printf("\t[%s EVT]", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ?
+		printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ?
 		    "ILLEGAL CMD" : "CMD HW ERR");
 		/* evt->addr is used as the physical address of the command. */
 		cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr);
 		printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n",
 		    cmd->opcode, cmd->word0, cmd->word1, cmd->addr);
 		break;
 
 	case AMDVI_EVENT_IOTLB_TIMEOUT:
-		printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx",
+		printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n",
 		    evt->devid, evt->addr);
 		break;
 
 	case AMDVI_EVENT_INVALID_DTE_REQ:
-		printf("\t[INV_DTE devid:0x%x addr:0x%lx",
-		    evt->devid, evt->addr);
+		printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n",
+		    evt->devid, evt->addr, evt->flag >> 9,
+		    (evt->flag >> 8) & 1);
 		break;
 
 	/* These two opcodes are only acknowledged, not decoded. */
 	case AMDVI_EVENT_INVALID_PPR_REQ:
 	case AMDVI_EVENT_COUNTER_ZERO:
 		printf("AMD-Vi: v2 events.\n");
 		break;
 
 	default:
-		printf("Unsupported AMD-Vi event:%d", evt->opcode);
+		printf("Unsupported AMD-Vi event:%d\n", evt->opcode);
 	}
 }
 
 /*
  * Decode and print event log entries starting at the event head,
  * advancing the head after each one.  Stops at the first entry whose
  * opcode is zero or after event_max entries.
  */
 static void
 amdvi_print_events(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_event *entry;
 	int count, size;
 
 	ctrl = softc->ctrl;
 	size = sizeof(struct amdvi_event);
 
 	count = 0;
 	while (count < softc->event_max) {
 		/* evt_head is a byte offset; convert it to an index. */
 		entry = &softc->event[ctrl->evt_head / size];
 		if (entry->opcode == 0)
 			break;
 
 		device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n",
 		    count, ctrl->evt_head, ctrl->evt_tail);
 		amdvi_decode_evt(entry);
 		ctrl->evt_head = MOD_INC(ctrl->evt_head, size,
 		    softc->event_max);
 		count++;
 	}
 }
 
 /*
  * Program the device table registers: the base is the page frame number
  * of amdvi_dte and the size field is set to 0x1FF.  Always returns 0.
  */
 static int
 amdvi_init_dte(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *regs = softc->ctrl;
 
 	regs->dte.base = vtophys(amdvi_dte) / PAGE_SIZE;
 	regs->dte.size = 0x1FF;		/* 2MB device table. */
 
 	return (0);
 }
 
 /*
  * Not all capabilities of IOMMU are available in ACPI IVHD flag
  * or EFR entry, read directly from device.
  *
  * Reads the IOMMU capability header from PCI config space, caches its
  * top byte in softc->pci_cap and the extended feature register in
  * softc->pci_efr, and prints both.  Always returns 0.
  */
 static int
 amdvi_print_pci_cap(device_t dev)
 {
 	struct amdvi_softc *softc;
 	uint32_t off, cap;
 
 
 	softc = device_get_softc(dev);
 	off = softc->cap_off;
 
 	/*
 	 * Section 3.7.1 of IOMMU spec rev 2.0.
 	 * Read capability from device.
 	 */
 	cap = amdvi_pci_read(softc, off);
 
 	/* Make sure capability type[18:16] is 3. */
 	KASSERT((((cap >> 16) & 0x7) == 0x3),
 	    ("Not a IOMMU capability 0x%x@0x%x", cap, off));
 
 	softc->pci_cap = cap >> 24;
 	device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n",
 	    cap, off, softc->pci_cap,
 	    "\020\001IOTLB\002HT\003NPCache\004EFR");
 
 	/* IOMMU spec Rev 2.0, section 3.7.2.1 */
 	softc->pci_efr = softc->ctrl->ex_feature;
 	if (softc->pci_efr) {
 		/*
 		 * "%b" bit numbers are 1-based and written as octal
 		 * escapes, so EFR bit n is "\0<n+1 in octal>".  "\008"
 		 * and "\009" are not octal escapes: they embed a NUL
 		 * that ends the description string early.
 		 */
 		device_printf(softc->dev, "PCI extended Feature:%b\n",
 		    (int)softc->pci_efr,
 		    "\020\001PreFSup\002PPRSup\003XTSup\004NXSup\005GTSup"
 		    "\007IASup\010GASup\011HESup\012PCSup");
 		device_printf(softc->dev,
 		    "PCI HATS = %d GATS = %d GLXSup = %d, max PASID: 0x%x\n",
 		    (int)((softc->pci_efr >> 10) & 0x3),
 		    (int)((softc->pci_efr >> 12) & 0x3),
 		    (int)((softc->pci_efr >> 14) & 0x3),
 		    (int)((softc->pci_efr >> 32) & 0x1F) + 1);
 	}
 
 	return (0);
 }
 
 /*
  * Event interrupt handler: log the interrupt count, status and queue
  * pointers, then print the pending events.  'arg' is the softc that was
  * registered with the handler.
  */
 static void
 amdvi_event_intr(void *arg)
 {
 	struct amdvi_softc *softc = arg;
 	struct amdvi_ctrl *regs = softc->ctrl;
 
 	/* The interrupt counter is bumped as a side effect of logging. */
 	device_printf(softc->dev,
 	    "EVT INTR %ld Status:0x%x EVT Head:0x%x Tail:0x%x]\n",
 	    softc->event_intr_cnt++, regs->status, regs->evt_head,
 	    regs->evt_tail);
 	printf("  [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n",
 	    softc->total_cmd, regs->cmd_tail, regs->cmd_head);
 
 	amdvi_print_events(softc);
 }
 
 /*
  * Release the event interrupt: tear down the handler and release the
  * IRQ resource when they exist, then delete the resource entry and
  * release the MSI through the grandparent bridge.
  */
 static void
 amdvi_free_evt_intr_res(device_t dev)
 {
 	struct amdvi_softc *sc = device_get_softc(dev);
 
 	if (sc->event_tag != NULL)
 		bus_teardown_intr(dev, sc->event_res, sc->event_tag);
 
 	if (sc->event_res != NULL)
 		bus_release_resource(dev, SYS_RES_IRQ, sc->event_rid,
 		    sc->event_res);
 
 	/* These two run unconditionally, as in the setup path's undo. */
 	bus_delete_resource(dev, SYS_RES_IRQ, sc->event_rid);
 	PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)),
 	    dev, 1, &sc->event_irq);
 }
 
 /*
  * Allocate and wire up the single MSI used for event (fault) reporting:
  * allocate the MSI, register it as IRQ resource, install
  * amdvi_event_intr() as handler, and program the MSI address/data into
  * the device's MSI capability before enabling it.
  *
  * Returns 0 on success or an errno value on failure.  The return type is
  * int rather than bool so the errno values reach the caller unchanged
  * (a bool would turn every error into 1).
  */
 static	int
 amdvi_alloc_intr_resources(struct amdvi_softc *softc)
 {
 	device_t dev, pcib;
 	uint64_t msi_addr;
 	uint32_t msi_data, temp;
 	int err, msi_off;
 
 	dev = softc->dev;
 	pcib = device_get_parent(device_get_parent(dev));
 	softc->event_irq = -1;
 	softc->event_rid = 0;
 	/*
 	 * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one
 	 * interrupt. XXX: Enable MSI/X support.
 	 */
 
 	err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq);
 	if (err) {
 		device_printf(dev,
 		    "Couldn't find event MSI IRQ resource.\n");
 		return (ENOENT);
 	}
 	err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid,
 	    softc->event_irq, 1);
 	if (err) {
 		device_printf(dev, "Couldn't set event MSI resource.\n");
 		return (ENXIO);
 	}
 	softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
 	    &softc->event_rid, RF_ACTIVE);
 	if (!softc->event_res) {
 		device_printf(dev,
 		    "Unable to allocate event INTR resource.\n");
 		return (ENOMEM);
 	}
 
 	if (bus_setup_intr(dev, softc->event_res,
 	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr,
 	    softc, &softc->event_tag)) {
 		device_printf(dev, "Fail to setup event intr\n");
 		bus_release_resource(softc->dev, SYS_RES_IRQ,
 		    softc->event_rid, softc->event_res);
 		/* Cleared so a later release does not free it twice. */
 		softc->event_res = NULL;
 		return (ENXIO);
 	}
 
 	bus_describe_intr(dev, softc->event_res, softc->event_tag,
 	    "fault");
 
 	err = amdvi_find_pci_cap(softc, PCIY_MSI, &msi_off);
 	if (err) {
 		device_printf(dev, "Couldn't find MSI capability, err = %d.\n",
 			      err);
 		return (err);
 	}
 
 	err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr,
 	    &msi_data);
 	if (err) {
 		device_printf(dev,
 		    "Event interrupt config failed, err=%d.\n",
 		    err);
 		amdvi_free_evt_intr_res(softc->dev);
 		return (err);
 	}
 
 	/* Configure MSI */
 	amdvi_pci_write(softc, msi_off + PCIR_MSI_ADDR, msi_addr);
 	amdvi_pci_write(softc, msi_off + PCIR_MSI_ADDR_HIGH,
 	    msi_addr >> 32);
 	amdvi_pci_write(softc, msi_off + PCIR_MSI_DATA_64BIT, msi_data);
 
 	/*
 	 * Now enable MSI interrupt.  The 32-bit read at msi_off covers the
 	 * message control word in its upper half, hence the shift by 16.
 	 */
 	temp = amdvi_pci_read(softc, msi_off);
 	temp |= (PCIM_MSICTRL_MSI_ENABLE << 16);	/* MSI enable. */
 	amdvi_pci_write(softc, msi_off, temp);
 
 	return (0);
 }
 
 
 /*
  * Print every IVHD device configuration entry recorded in the softc:
  * the device ID range, the decoded data flags and whether ATS is
  * enabled for the range.
  */
 static void
 amdvi_print_dev_cap(struct amdvi_softc *softc)
 {
 	struct ivhd_dev_cfg *cfg;
 	int i;
 
 	cfg = softc->dev_cfg;
 	for (i = 0; i < softc->dev_cfg_cnt; i++) {
 		/*
 		 * "%b" bit numbers are octal escapes: bit 8 is "\010".
 		 * "\008" is not an octal escape; it embeds a NUL that cuts
 		 * the description short.
 		 */
 		device_printf(softc->dev, "device [0x%x - 0x%x]"
 		    "config:%b%s\n", cfg->start_id, cfg->end_id,
 		    cfg->data,
 		    "\020\001INIT\002ExtInt\003NMI"
 		    "\007LINT0\010LINT1",
 		    cfg->enable_ats ? "ATS enabled" : "");
 		cfg++;
 	}
 }
 
 /*
  * Sysctl handler for the queue pointers.  arg1 is the softc and arg2
  * selects the register: 0 command head, 1 command tail, 2 event head,
  * 3 event tail.  Any other selector is logged and 0 is returned.
  */
 static int
 amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct amdvi_softc *softc = (struct amdvi_softc *)arg1;
 	int value;
 	int selector = arg2;
 
 	/* Snapshot the selected register, then report it as an int. */
 	switch (selector) {
 	case 0:
 		value = softc->ctrl->cmd_head;
 		break;
 	case 1:
 		value = softc->ctrl->cmd_tail;
 		break;
 	case 2:
 		value = softc->ctrl->evt_head;
 		break;
 	case 3:
 		value = softc->ctrl->evt_tail;
 		break;
 	default:
 		device_printf(softc->dev, "Unknown sysctl:%d\n", selector);
 		return (0);
 	}
 
 	return (sysctl_handle_int(oidp, &value, 0, req));
 }
 
 /*
  * Register the read-only sysctl nodes of this IOMMU under the device's
  * sysctl tree: counters, PCI/device RIDs, and the command and event
  * queue pointers (served by amdvi_handle_sysctl()).
  */
 static void
 amdvi_add_sysctl(struct amdvi_softc *softc)
 {
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
 	device_t dev;
 
 	dev = softc->dev;
 	ctx = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 
 	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD,
 	    &softc->event_intr_cnt, "Event interrupt count");
 	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD,
 	    &softc->total_cmd, "Command submitted count");
-	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD,
-	    (int *)&softc->pci_rid, 0,
-	    "IOMMU RID");
-	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD,
-	    (int *)&softc->start_dev_rid, 0,
-	    "Start of device under this IOMMU");
-	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD,
-	    (int *)&softc->end_dev_rid, 0,
-	    "End of device under this IOMMU");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD,
+	    &softc->pci_rid, 0, "IOMMU RID");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD,
+	    &softc->start_dev_rid, 0, "Start of device under this IOMMU");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD,
+	    &softc->end_dev_rid, 0, "End of device under this IOMMU");
 	/* The last numeric argument is the selector for the handler. */
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head",
 	    CTLTYPE_UINT | CTLFLAG_RD, softc, 0,
 	    amdvi_handle_sysctl, "IU", "Command head");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail",
 	    CTLTYPE_UINT | CTLFLAG_RD, softc, 1,
 	    amdvi_handle_sysctl, "IU", "Command tail");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head",
 	    CTLTYPE_UINT | CTLFLAG_RD, softc, 2,
 	    amdvi_handle_sysctl, "IU", "Event head");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail",
 	    CTLTYPE_UINT | CTLFLAG_RD, softc, 3,
 	    amdvi_handle_sysctl, "IU", "Event tail");
 }
 
 /*
  * Bring up one IOMMU: enable IOTLB support, print the capabilities, set
  * up the command buffer, event buffer and device table, allocate the
  * event interrupt and register the sysctl nodes.
  *
  * Returns 0 on success or the status of the first step that failed.
  */
 int
 amdvi_setup_hw(struct amdvi_softc *softc)
 {
 	device_t dev = softc->dev;
 	int status;
 
 	amdvi_hw_enable_iotlb(softc);
 	amdvi_print_dev_cap(softc);
 
 	status = amdvi_print_pci_cap(dev);
 	if (status != 0) {
 		device_printf(dev, "PCI capability.\n");
 		return (status);
 	}
 
 	status = amdvi_init_cmd(softc);
 	if (status != 0) {
 		device_printf(dev, "Couldn't configure command buffer.\n");
 		return (status);
 	}
 
 	status = amdvi_init_event(softc);
 	if (status != 0) {
 		device_printf(dev, "Couldn't configure event buffer.\n");
 		return (status);
 	}
 
 	status = amdvi_init_dte(softc);
 	if (status != 0) {
 		device_printf(dev, "Couldn't configure device table.\n");
 		return (status);
 	}
 
 	/* The interrupt setup prints its own diagnostics. */
 	status = amdvi_alloc_intr_resources(softc);
 	if (status != 0)
 		return (status);
 
 	amdvi_add_sysctl(softc);
 	return (0);
 }
 
 int
 amdvi_teardown_hw(struct amdvi_softc *softc)
 {
 	device_t dev;
 
 	dev = softc->dev;
 
 	/* 
 	 * Called after disable, h/w is stopped by now, free all the resources. 
 	 */
 	amdvi_free_evt_intr_res(dev);
 
 	if (softc->cmd)
 		free(softc->cmd, M_AMDVI);
 
 	if (softc->event)
 		free(softc->event, M_AMDVI);
 
 	return (0);
 }
 
 /*********** bhyve interfaces *********************/
 /*
  * iommu_ops init hook.  Returns EIO when no IVHD device was found,
  * EINVAL (with a hint on the console) when devices exist but the user
  * has not enabled AMD-Vi, and 0 otherwise.
  */
 static int
 amdvi_init(void)
 {
 
 	if (!ivhd_count)
 		return (EIO);
 
 	/* ivhd_count is known to be non-zero from here on. */
 	if (amdvi_enable_user)
 		return (0);
 
 	printf("bhyve: Found %d AMD-Vi/IOMMU device(s), "
 	    "use hw.vmm.amdvi_enable=1 to enable pass-through.\n",
 	    ivhd_count);
 	return (EINVAL);
 }
 
 /* iommu_ops cleanup hook; there is nothing to undo. */
 static void
 amdvi_cleanup(void)
 {
 
 }
 
 /*
  * Hand out the next domain ID from the global counter and advance it.
  */
 static uint16_t
 amdvi_domainId(void)
 {
 	uint16_t id;
 
 	/*
 	 * On reaching the maximum, restart at 1 so that the host
 	 * domain (0) is never handed out again.
 	 * XXX: make sure that this domain is not used.
 	 */
 	if (amdvi_dom_id == AMDVI_MAX_DOMAIN)
 		amdvi_dom_id = 1;
 
 	id = (uint16_t)amdvi_dom_id;
 	amdvi_dom_id++;
 
 	return (id);
 }
 
 /*
  * Invalidate 'domain_id' on every IOMMU in ivhd_devs and wait for each
  * one to complete.  'create' is only consulted by the disabled code
  * below.
  */
 static void
 amdvi_do_inv_domain(uint16_t domain_id, bool create)
 {
 	struct amdvi_softc *softc;
 	int unit;
 
 	unit = 0;
 	while (unit < ivhd_count) {
 		softc = device_get_softc(ivhd_devs[unit]);
 		unit++;
 		KASSERT(softc, ("softc is NULL"));
 		/*
 		 * When not-present pages are cached, pages have to be
 		 * invalidated after a domain is created.
 		 */
 #if 0
 		if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0))
 			continue;
 #endif
 		amdvi_inv_domain(softc, domain_id);
 		amdvi_wait(softc);
 	}
 }
 
 /*
  * iommu_ops create-domain hook: allocate a domain, give it a fresh ID
  * and (unless it is the host domain without amdvi_host_ptp) a zeroed
  * top-level page table, invalidate the ID on all IOMMUs and link the
  * domain into dom_head.  'maxaddr' is currently unused.
  *
  * Returns the new domain as an opaque pointer.
  */
 static void *
 amdvi_create_domain(vm_paddr_t maxaddr)
 {
 	struct amdvi_domain *domain;
 
 	domain = malloc(sizeof(struct amdvi_domain), M_AMDVI,
 	    M_ZERO | M_WAITOK);
 	domain->id = amdvi_domainId();
 #ifdef AMDVI_DEBUG_CMD
 	printf("Created domain #%d\n", domain->id);
 #endif
 	/* The host domain (#0) gets a table only if amdvi_host_ptp is set. */
 	if (domain->id || amdvi_host_ptp)
 		domain->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO);
 
 	domain->ptp_level = amdvi_ptp_level;
 
 	amdvi_do_inv_domain(domain->id, true);
 	SLIST_INSERT_HEAD(&dom_head, domain, next);
 
 	return (domain);
 }
 
 /*
  * Recursively free a page table page and the tables reachable through
  * its present entries.  'level' is the level of 'ptp'; a level below 1
  * means there is nothing to free.
  */
 static void
 amdvi_free_ptp(uint64_t *ptp, int level)
 {
 	uint64_t *child;
 	int slot;
 
 	if (level < 1)
 		return;
 
 	for (slot = 0; slot < NPTEPG ; slot++) {
 		if ((ptp[slot] & AMDVI_PT_PRESENT) != 0) {
 			/* XXX: Add super-page or PTE mapping > 4KB. */
 #ifdef notyet
 			/* Super-page mapping. */
 			if (AMDVI_PD_SUPER(ptp[slot]))
 				continue;
 #endif
 
 			/* Descend into the table this entry points to. */
 			child = (uint64_t *)PHYS_TO_DMAP(ptp[slot]
 			    & AMDVI_PT_MASK);
 			amdvi_free_ptp(child, level - 1);
 		}
 	}
 
 	free(ptp, M_AMDVI);
 }
 
 static void
 amdvi_destroy_domain(void *arg)
 {
 	struct amdvi_domain *domain;
 
 	domain = (struct amdvi_domain *)arg;
 	KASSERT(domain, ("domain is NULL"));
 #ifdef AMDVI_DEBUG_CMD
 	printf("Destroying domain %d\n", domain->id);
 #endif
 	if (domain->ptp)
 		amdvi_free_ptp(domain->ptp, domain->ptp_level);
 
 	amdvi_do_inv_domain(domain->id, false);
 	SLIST_REMOVE(&dom_head, domain, amdvi_domain, next);
 	free(domain, M_AMDVI);
 }
 
 /*
  * Install (create == true) or clear (create == false) the leaf entry
  * that translates 'gpa' in the page table rooted at 'pt'.
  *
  * pt       top-level page table page
  * level    level of 'pt'; recorded in newly created directory entries
  * gpa      guest physical address, must be aligned to pg_size
  * hpa      host physical address, must be aligned to pg_size
  * pg_size  size of the mapping
  * create   true to map, false to unmap
  *
  * Intermediate tables are allocated on demand while mapping.  While
  * unmapping, a missing intermediate table means the address was never
  * mapped: the walk stops there instead of following a zero entry.
  *
  * Returns the number of bytes handled, or 0 on a zero pg_size or a
  * misaligned address.
  */
 static uint64_t
 amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa,
     vm_paddr_t hpa, uint64_t pg_size, bool create)
 {
 	uint64_t *page, pa;
 	int shift, index;
 	const int PT_SHIFT = 9;
 	const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1;	/* Based on PT_SHIFT */
 
 	if (!pg_size)
 		return (0);
 
 	if (hpa & (pg_size - 1)) {
 		printf("HPA is not size aligned.\n");
 		return (0);
 	}
 	if (gpa & (pg_size - 1)) {
 		printf("GPA is not size aligned.\n");
 		return (0);
 	}
 	shift = PML4SHIFT;
 	while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) {
 		index = (gpa >> shift) & PT_INDEX_MASK;
 
 		if (pt[index] == 0) {
 			/*
 			 * Nothing is mapped below this entry.  When
 			 * unmapping there is nothing to clear, and
 			 * following the zero entry would write into the
 			 * page at physical address 0.
 			 */
 			if (!create)
 				return (pg_size);
 
 			page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO);
 			pa = vtophys(page);
 			pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW |
 			    ((level - 1) << AMDVI_PD_LEVEL_SHIFT);
 		}
 #ifdef AMDVI_DEBUG_PTE
 		if ((gpa % 0x1000000) == 0)
 			printf("[level%d, shift = %d]PTE:0x%lx\n",
 			    level, shift, pt[index]);
 #endif
 #define PTE2PA(x)	((uint64_t)(x) & AMDVI_PT_MASK)
 		/* Step down into the next-level table. */
 		pa = PTE2PA(pt[index]);
 		pt = (uint64_t *)PHYS_TO_DMAP(pa);
 		shift -= PT_SHIFT;
 		level--;
 	}
 
 	/* Leaf entry. */
 	index = (gpa >> shift) & PT_INDEX_MASK;
 
 	if (create) {
 		pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT;
 	} else
 		pt[index] = 0;
 
 #ifdef AMDVI_DEBUG_PTE
 	if ((gpa % 0x1000000) == 0)
 		printf("[Last level%d, shift = %d]PTE:0x%lx\n",
 		    level, shift, pt[index]);
 #endif
 	return (1ULL << shift);
 }
 
 /*
  * Map or unmap 'size' bytes starting at gpa/hpa in the domain's page
  * table, one PAGE_SIZE step at a time.
  *
  * Returns the number of bytes processed, or 0 as soon as one step
  * fails.
  */
 static uint64_t
 amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa,
     vm_paddr_t hpa, uint64_t size, bool create)
 {
 	uint64_t done, *root, step;
 	int level;
 
 	KASSERT(domain, ("domain is NULL"));
 	level = domain->ptp_level;
 	KASSERT(level, ("Page table level is 0"));
 
 	root = domain->ptp;
 	KASSERT(root, ("PTP is NULL"));
 
 	for (done = 0; done < size; done += step) {
 		step = amdvi_set_pt(root, level, gpa + done, hpa + done,
 		    PAGE_SIZE, create);
 		if (step == 0) {
 			printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n",
 			    hpa, gpa);
 			return (0);
 		}
 	}
 
 	return (done);
 }
 
 /*
  * iommu_ops create-mapping hook: map 'len' bytes of gpa -> hpa in the
  * domain passed as 'arg'.
  *
  * Returns the number of bytes mapped.  A domain without a page table
  * reports 'len' without touching anything if it is the host domain
  * (id 0), and (uint64_t)-1 otherwise.
  */
 static uint64_t
 amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa,
     uint64_t len)
 {
 	struct amdvi_domain *domain;
 
 	domain = (struct amdvi_domain *)arg;
 
 	if (domain->id && !domain->ptp) {
 		/* Terminate the line so later console output stays readable. */
 		printf("ptp is NULL\n");
 		return (-1);
 	}
 
 	/*
 	 * If host domain is created w/o page table, skip IOMMU page
 	 * table set-up.
 	 */
 	if (domain->ptp)
 		return (amdvi_update_mapping(domain, gpa, hpa, len, true));
 	else
 		return (len);
 }
 
 /*
  * iommu_ops destroy-mapping hook: unmap 'len' bytes starting at 'gpa'
  * from the domain passed as 'arg'.  Returns the number of bytes
  * handled.
  */
 static uint64_t
 amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
 {
 	struct amdvi_domain *dom = arg;
 
 	/*
 	 * A host domain created without a page table has nothing to
 	 * tear down; report the whole range as handled.
 	 */
 	if (dom->ptp == NULL)
 		return (len);
 
 	return (amdvi_update_mapping(dom, gpa, 0, len, false));
 }
 
 /*
  * Return the softc of the IOMMU whose [start_dev_rid, end_dev_rid]
  * range contains 'devid'.  When no range matches, a message is printed
  * and the first IOMMU is returned.
  */
 static struct amdvi_softc *
 amdvi_find_iommu(uint16_t devid)
 {
 	struct amdvi_softc *candidate;
 	int unit;
 
 	for (unit = 0; unit < ivhd_count; unit++) {
 		candidate = device_get_softc(ivhd_devs[unit]);
 		if (devid < candidate->start_dev_rid ||
 		    devid > candidate->end_dev_rid)
 			continue;
 		return (candidate);
 	}
 
 	/*
 	 * XXX: BIOS bug, the device is missing from the IVRS table;
 	 * assume it belongs to the first IOMMU.
 	 */
 	printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n",
 	    RID2PCI_STR(devid));
 
 	return (device_get_softc(ivhd_devs[0]));
 }
 
 /*
  * Set-up device table entry.
  * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must
  * be set concurrently, e.g. read and write bits.
  */
 static void
 amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable)
 {
 	struct amdvi_softc *softc;
 	struct amdvi_dte temp;
 
 	softc = amdvi_find_iommu(devid);
 	KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid));
 
 	memset(&temp, 0, sizeof(struct amdvi_dte));
 
 #ifdef AMDVI_ATS_ENABLE
 	/* If IOMMU and device support IOTLB, enable it. */
 	if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb)
 		temp.iotlb_enable = 1;
 #endif
 
 	/* Avoid duplicate I/O faults. */
 	temp.sup_second_io_fault = 1;
 	temp.sup_all_io_fault = amdvi_disable_io_fault;
 
 	temp.dt_valid = 1;
 	temp.domain_id = domain->id;
 
 	if (enable) {
 		if (domain->ptp) {
 			temp.pt_base = vtophys(domain->ptp) >> 12;
 			temp.pt_level = amdvi_ptp_level;
 		}
 		/*
 		 * XXX: Page table valid[TV] bit must be set even if host domain
 		 * page tables are not enabled.
 		 */
 		temp.pt_valid = 1;
 		temp.read_allow = 1;
 		temp.write_allow = 1;
 	}
 	amdvi_dte[devid] = temp;
 }
 
 /*
  * Invalidate the cached device table entry of 'devid' on the IOMMU that
  * owns it (plus the device IOTLB when ATS support is compiled in) and
  * wait for completion.
  */
 static void
 amdvi_inv_device(uint16_t devid)
 {
 	struct amdvi_softc *owner;
 
 	owner = amdvi_find_iommu(devid);
 	KASSERT(owner, ("softc is NULL"));
 
 	amdvi_cmd_inv_dte(owner, devid);
 #ifdef AMDVI_ATS_ENABLE
 	if (amdvi_dev_support_iotlb(owner, devid))
 		amdvi_cmd_inv_iotlb(owner, devid);
 #endif
 	amdvi_wait(owner);
 }
 
 /*
  * iommu_ops add-device hook: write an enabled device table entry that
  * binds 'devid' to the domain passed as 'arg', then invalidate the
  * device.
  */
 static void
 amdvi_add_device(void *arg, uint16_t devid)
 {
 	struct amdvi_domain *dom = arg;
 
 	KASSERT(dom != NULL, ("domain is NULL"));
 #ifdef AMDVI_DEBUG_CMD
 	printf("Assigning device(%d.%d.%d) to domain:%d\n",
 	    RID2PCI_STR(devid), dom->id);
 #endif
 	amdvi_set_dte(dom, devid, true);
 	amdvi_inv_device(devid);
 }
 
 /*
  * iommu_ops remove-device hook: rewrite the device table entry of
  * 'devid' without access permissions, then invalidate the device.
  */
 static void
 amdvi_remove_device(void *arg, uint16_t devid)
 {
 	struct amdvi_domain *dom = arg;
 
 #ifdef AMDVI_DEBUG_CMD
 	printf("Remove device(0x%x) from domain:%d\n",
 	       devid, dom->id);
 #endif
 	amdvi_set_dte(dom, devid, false);
 	amdvi_inv_device(devid);
 }
 
 /*
  * iommu_ops enable hook: write the control register of every IOMMU.
  * The value always carries the enable, command, event-log, event-log
  * interrupt and AMDVI_CTRL_INV_TO_1S bits; the COH, HTT, RPPW, PPW and
  * ISOC bits are copied over from the corresponding IVHD flags.
  */
 static void
 amdvi_enable(void)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_softc *softc;
 	uint64_t bits;
 	int unit;
 
 	for (unit = 0; unit < ivhd_count; unit++) {
 		softc = device_get_softc(ivhd_devs[unit]);
 		KASSERT(softc, ("softc is NULL\n"));
 		ctrl = softc->ctrl;
 		KASSERT(ctrl, ("ctrl is NULL\n"));
 
 		/* Bits that are set on every unit. */
 		bits = AMDVI_CTRL_EN;
 		bits |= AMDVI_CTRL_CMD;
 		bits |= AMDVI_CTRL_ELOG;
 		bits |= AMDVI_CTRL_ELOGINT;
 		bits |= AMDVI_CTRL_INV_TO_1S;
 
 		/* Bits that follow the IVHD flags of this unit. */
 		if (softc->ivhd_flag & IVHD_FLAG_COH)
 			bits |= AMDVI_CTRL_COH;
 		if (softc->ivhd_flag & IVHD_FLAG_HTT)
 			bits |= AMDVI_CTRL_HTT;
 		if (softc->ivhd_flag & IVHD_FLAG_RPPW)
 			bits |= AMDVI_CTRL_RPPW;
 		if (softc->ivhd_flag & IVHD_FLAG_PPW)
 			bits |= AMDVI_CTRL_PPW;
 		if (softc->ivhd_flag & IVHD_FLAG_ISOC)
 			bits |= AMDVI_CTRL_ISOC;
 
 		ctrl->control = bits;
 	}
 }
 
 /*
  * iommu_ops disable hook: clear the control register of every IOMMU.
  */
 static void
 amdvi_disable(void)
 {
 	struct amdvi_softc *softc;
 	struct amdvi_ctrl *regs;
 	int unit;
 
 	unit = 0;
 	while (unit < ivhd_count) {
 		softc = device_get_softc(ivhd_devs[unit]);
 		KASSERT(softc, ("softc is NULL\n"));
 		regs = softc->ctrl;
 		KASSERT(regs, ("ctrl is NULL\n"));
 
 		regs->control = 0;
 		unit++;
 	}
 }
 
 /*
  * iommu_ops TLB-invalidate hook: invalidate the domain passed as 'arg'
  * on all IOMMUs.
  */
 static void
 amdvi_inv_tlb(void *arg)
 {
 	struct amdvi_domain *dom = arg;
 
 	KASSERT(dom, ("domain is NULL"));
 	amdvi_do_inv_domain(dom->id, false);
 }
 
 /*
  * AMD-Vi implementation of the bhyve IOMMU operations.
  *
  * The initializer is positional, so the entries have to stay in the
  * order in which struct iommu_ops declares its members (the declaration
  * is not part of this file).
  */
 struct iommu_ops iommu_ops_amd = {
 	amdvi_init,
 	amdvi_cleanup,
 	amdvi_enable,
 	amdvi_disable,
 	amdvi_create_domain,
 	amdvi_destroy_domain,
 	amdvi_create_mapping,
 	amdvi_destroy_mapping,
 	amdvi_add_device,
 	amdvi_remove_device,
 	amdvi_inv_tlb
 };
Index: projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_priv.h
===================================================================
--- projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_priv.h	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/amd64/vmm/amd/amdvi_priv.h	(revision 326162)
@@ -1,395 +1,395 @@
 /*-
  * Copyright (c) 2016 Anish Gupta (anish@freebsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _AMDVI_PRIV_H_
 #define _AMDVI_PRIV_H_
 
 /* 64-bit mask with only bit n set. */
 #define	BIT(n)			(1ULL << (n))
 /*
  * Return the value of bits [n:m] of x, where n and m are bit positions
  * and n >= m.  The mask is built from a plain int 1, so the field width
  * (n - m + 1) has to stay below the width of int.
  */
 #define REG_BITS(x, n, m)	(((x) >> (m)) & 		\
 				((1 << (((n) - (m)) + 1)) - 1))
 
 /*
  * IOMMU PCI capability.
  */
 #define AMDVI_PCI_CAP_IOTLB	BIT(0)	/* IOTLB is supported. */
 #define AMDVI_PCI_CAP_HT	BIT(1)	/* HyperTransport tunnel support. */
 #define AMDVI_PCI_CAP_NPCACHE	BIT(2)	/* Not present page cached. */
 #define AMDVI_PCI_CAP_EFR	BIT(3)	/* Extended features. */
 #define AMDVI_PCI_CAP_EXT	BIT(4)	/* Miscellaneous information reg. */
 
 /*
  * IOMMU extended features.
  */
 #define AMDVI_EX_FEA_PREFSUP	BIT(0)	/* Prefetch command support. */
 #define AMDVI_EX_FEA_PPRSUP	BIT(1)	/* PPR support */
 #define AMDVI_EX_FEA_XTSUP	BIT(2)	/* Reserved */
 #define AMDVI_EX_FEA_NXSUP	BIT(3)	/* No-execute. */
 #define AMDVI_EX_FEA_GTSUP	BIT(4)	/* Guest translation support. */
 #define AMDVI_EX_FEA_EFRW	BIT(5)	/* Reserved */
 #define AMDVI_EX_FEA_IASUP	BIT(6)	/* Invalidate all command supp. */
 #define AMDVI_EX_FEA_GASUP	BIT(7)	/* Guest APIC or AVIC support. */
 #define AMDVI_EX_FEA_HESUP	BIT(8)	/* Hardware Error. */
 #define AMDVI_EX_FEA_PCSUP	BIT(9)	/* Performance counters support. */
 /* XXX: add more EFR (extended feature register) bits. */
 
 /*
  * Device table entry or DTE
  * NOTE: Must be 256-bits/32 bytes aligned.
  *
  * The entry is described as a packed run of bit-fields; the trailing
  * comments give the meaning of each field and the bit range of every
  * reserved gap.  The CTASSERT that follows the structure checks that the
  * fields add up to exactly 32 bytes.
  */
 struct amdvi_dte {
 	uint32_t dt_valid:1;		/* Device Table valid. */
 	uint32_t pt_valid:1;		/* Page translation valid. */
 	uint16_t :7;			/* Reserved[8:2] */
 	uint8_t	 pt_level:3;		/* Paging level, 0 to disable. */
 	uint64_t pt_base:40;		/* Page table root pointer. */
 	uint8_t  :3;			/* Reserved[54:52] */
 	uint8_t	 gv_valid:1;		/* Revision 2, GVA to SPA. */
 	uint8_t	 gv_level:2;		/* Revision 2, GLX level. */
 	uint8_t	 gv_cr3_lsb:3;		/* Revision 2, GCR3[14:12] */
 	uint8_t	 read_allow:1;		/* I/O read enabled. */
 	uint8_t	 write_allow:1;		/* I/O write enabled. */
 	uint8_t  :1;			/* Reserved[63] */
 	uint16_t domain_id:16;		/* Domain ID */
 	uint16_t gv_cr3_lsb2:16;	/* Revision 2, GCR3[30:15] */
 	uint8_t	 iotlb_enable:1;	/* Device support IOTLB */
 	uint8_t	 sup_second_io_fault:1;	/* Suppress subsequent I/O faults. */
 	uint8_t	 sup_all_io_fault:1;	/* Suppress all I/O page faults. */
 	uint8_t	 IOctl:2;		/* Port I/O control. */
 	uint8_t	 iotlb_cache_disable:1;	/* IOTLB cache hints. */
 	uint8_t	 snoop_disable:1;	/* Snoop disable. */
 	uint8_t	 allow_ex:1;		/* Allow exclusion. */
 	uint8_t	 sysmgmt:2;		/* System management message.*/
 	uint8_t  :1;			/* Reserved[106] */
 	uint32_t gv_cr3_msb:21;		/* Revision 2, GCR3[51:31] */
 	uint8_t	 intmap_valid:1;	/* Interrupt map valid. */
 	uint8_t	 intmap_len:4;		/* Interrupt map table length. */
 	uint8_t	 intmap_ign:1;		/* Ignore unmapped interrupts. */
 	uint64_t intmap_base:46;	/* IntMap base. */
 	uint8_t  :4;			/* Reserved[183:180] */
 	uint8_t	 init_pass:1;		/* INIT pass through or PT */
 	uint8_t	 extintr_pass:1;	/* External Interrupt PT */
 	uint8_t	 nmi_pass:1;		/* NMI PT */
 	uint8_t  :1;			/* Reserved[187] */
 	uint8_t	 intr_ctrl:2;		/* Interrupt control */
 	uint8_t	 lint0_pass:1;		/* LINT0 PT */
 	uint8_t	 lint1_pass:1;		/* LINT1 PT */
 	uint64_t :64;			/* Reserved[255:192] */
 } __attribute__((__packed__));
 /* Guard against a field edit silently changing the entry size. */
 CTASSERT(sizeof(struct amdvi_dte) == 32);
 
 /*
  * IOMMU command entry.
  *
  * Declared packed: a 32-bit word, a second 32-bit unit split into a 28-bit
  * operand and the 4-bit opcode, and a 64-bit address word.
  * NOTE(review): unlike amdvi_dte and amdvi_event there is no CTASSERT on the
  * size of this structure -- presumably it is meant to be 16 bytes; confirm.
  */
 struct amdvi_cmd {
 	uint32_t 	word0;
 	uint32_t 	word1:28;
 	uint8_t		opcode:4;	/* Command opcode, see the list below. */
 	uint64_t 	addr;
 } __attribute__((__packed__));
 
 /* Command opcodes. */
 #define AMDVI_CMP_WAIT_OPCODE	0x1	/* Completion wait. */
 #define AMDVI_INVD_DTE_OPCODE	0x2	/* Invalidate device table entry. */
 #define AMDVI_INVD_PAGE_OPCODE	0x3	/* Invalidate pages. */
 #define AMDVI_INVD_IOTLB_OPCODE	0x4	/* Invalidate IOTLB pages. */
 #define AMDVI_INVD_INTR_OPCODE	0x5	/* Invalidate Interrupt table. */
 #define AMDVI_PREFETCH_PAGES_OPCODE	0x6	/* Prefetch IOMMU pages. */
 #define AMDVI_COMP_PPR_OPCODE	0x7	/* Complete PPR request. */
 #define AMDVI_INV_ALL_OPCODE	0x8	/* Invalidate all. */
 
 /* Completion wait attributes. */
 #define AMDVI_CMP_WAIT_STORE	BIT(0)	/* Write back data. */
 #define AMDVI_CMP_WAIT_INTR	BIT(1)	/* Completion wait interrupt. */
 #define AMDVI_CMP_WAIT_FLUSH	BIT(2)	/* Flush queue. */
 
 /* Invalidate page. */
 #define AMDVI_INVD_PAGE_S	BIT(0)	/* Invalidation size. */
 #define AMDVI_INVD_PAGE_PDE	BIT(1)	/* Invalidate PDE. */
 #define AMDVI_INVD_PAGE_GN_GVA	BIT(2)	/* GPA or GVA. */
 
 #define AMDVI_INVD_PAGE_ALL_ADDR	(0x7FFFFFFFFFFFFULL << 12)
 
 /* Invalidate IOTLB. */
 #define AMDVI_INVD_IOTLB_S	BIT(0)	/* Invalidation size 4k or addr */
 #define AMDVI_INVD_IOTLB_GN_GVA	BIT(2)	/* GPA or GVA. */
 
 #define AMDVI_INVD_IOTLB_ALL_ADDR	(0x7FFFFFFFFFFFFULL << 12)
 /* XXX: add more command entries. */
 
 /*
  * IOMMU event entry.
  *
  * Packed layout of one event log record; the CTASSERT that follows checks
  * that a record is exactly 16 bytes.
  */
 struct amdvi_event {
 	uint16_t 	devid;
 	uint16_t 	pasid_hi;
 	uint16_t 	pasid_domid;	/* PASID low or DomainID */
 	uint16_t 	flag:12;	/* See AMDVI_EVENT_FLAG_* below. */
 	uint8_t		opcode:4;	/* Event type, AMDVI_EVENT_* below. */
 	uint64_t 	addr;
 } __attribute__((__packed__));
 CTASSERT(sizeof(struct amdvi_event) == 16);
 
 /* Various event types. */
 #define AMDVI_EVENT_INVALID_DTE		0x1
 #define AMDVI_EVENT_PFAULT		0x2
 #define AMDVI_EVENT_DTE_HW_ERROR	0x3
 #define AMDVI_EVENT_PAGE_HW_ERROR	0x4
 #define AMDVI_EVENT_ILLEGAL_CMD		0x5
 #define AMDVI_EVENT_CMD_HW_ERROR	0x6
 #define AMDVI_EVENT_IOTLB_TIMEOUT	0x7
 #define AMDVI_EVENT_INVALID_DTE_REQ	0x8
 #define AMDVI_EVENT_INVALID_PPR_REQ	0x9
 #define AMDVI_EVENT_COUNTER_ZERO	0xA
 
 #define AMDVI_EVENT_FLAG_MASK           0x1FF	/* Mask for event flags. */
 #define AMDVI_EVENT_FLAG_TYPE(x)        (((x) >> 9) & 0x3)
 
 /*
  * IOMMU control block.
  *
  * Packed bit-field description of the IOMMU register area.  The layout is
  * position sensitive: the CTASSERTs after the structure pin pad1, pad2 and
  * pad3 to offsets 0x58, 0x2028 and 0x2040, and a later CTASSERT in this
  * header pins the total size to 0x4000.
  */
 struct amdvi_ctrl {
 	/* Device table base and size. */
 	struct {
 		uint16_t size:9;
 		uint16_t :3;
 		uint64_t base:40;	/* Devtable register base. */
 		uint16_t :12;
 	} dte;
 	/* Command buffer base and length. */
 	struct {
 		uint16_t :12;
 		uint64_t base:40;
 		uint8_t  :4;
 		uint8_t	 len:4;
 		uint8_t  :4;
 	} cmd;
 	/* Event log base and length. */
 	struct {
 		uint16_t :12;
 		uint64_t base:40;
 		uint8_t  :4;
 		uint8_t	 len:4;
 		uint8_t  :4;
 	} event;
 	/*
 	 * Control register.  Only the low 13 bits are reachable through this
 	 * field, i.e. AMDVI_CTRL_* bits 0 through 12.
 	 */
 	uint16_t control :13;
 	uint64_t	 :51;
 	/* Exclusion range: enable/allow flags, base and limit. */
 	struct {
 		uint8_t	 enable:1;
 		uint8_t	 allow:1;
 		uint16_t :10;
 		uint64_t base:40;
 		uint16_t :12;
 		uint16_t :12;
 		uint64_t limit:40;
 		uint16_t :12;
 	} excl;
 	/* 
 	 * Revision 2 only. 
 	 */
 	uint64_t ex_feature;
 	struct {
 		uint16_t :12;
 		uint64_t base:40;
 		uint8_t  :4;
 		uint8_t	 len:4;
 		uint8_t  :4;
 	} ppr;
 	uint64_t first_event;
 	uint64_t second_event;
 	uint64_t event_status;
 	/* Revision 2 only, end. */
 	uint8_t	 pad1[0x1FA8];		/* Padding. */
 	/* Head/tail registers; each occupies one 64-bit slot. */
 	uint32_t cmd_head:19;
 	uint64_t :45;
 	uint32_t cmd_tail:19;
 	uint64_t :45;
 	uint32_t evt_head:19;
 	uint64_t :45;
 	uint32_t evt_tail:19;
 	uint64_t :45;
-	uint64_t :56;
-	uint8_t	 status:8;
+	uint32_t status:19;
+	uint64_t :45;
 	/* The slot just above is the status register, 8 bytes below pad2. */
 	uint64_t pad2;
 	uint8_t  :4;
 	uint16_t ppr_head:15;
 	uint64_t :45;
 	uint8_t  :4;
 	uint16_t ppr_tail:15;
 	uint64_t :45;
 	uint8_t	 pad3[0x1FC0];		/* Padding. */
 
 	/* XXX: More for rev2. */
 } __attribute__((__packed__));
 /* Offsets of the padding members; they fail to compile if a field moves. */
 CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58);
 CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028);
 CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040);
 
 #define AMDVI_MMIO_V1_SIZE	(4 * PAGE_SIZE)	/* v1 size */
 /*
  * AMD IOMMU v2 MMIO size, including event counters.
  */
 #define AMDVI_MMIO_V2_SIZE	(8 * PAGE_SIZE)
 
 CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000);
 CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE);
 
 /* IVHD flag */
 #define IVHD_FLAG_HTT		BIT(0)	/* Hypertransport Tunnel. */
 #define IVHD_FLAG_PPW		BIT(1)	/* Pass posted write. */
 #define IVHD_FLAG_RPPW		BIT(2)	/* Response pass posted write. */
 #define IVHD_FLAG_ISOC		BIT(3)	/* Isoc support. */
 #define IVHD_FLAG_IOTLB		BIT(4)	/* IOTLB support. */
 #define IVHD_FLAG_COH		BIT(5)	/* Coherent control, default 1 */
 #define IVHD_FLAG_PFS		BIT(6)	/* Prefetch IOMMU pages. */
 #define IVHD_FLAG_PPRS		BIT(7)	/* Peripheral page support. */
 
 /* IVHD device entry data setting. */
 #define IVHD_DEV_LINT0_PASS	BIT(6)	/* LINT0 interrupts. */
 #define IVHD_DEV_LINT1_PASS	BIT(7)	/* LINT1 interrupts. */
 
 /* Bit[5:4] for System Mgmt. Bit3 is reserved. */
 #define IVHD_DEV_INIT_PASS	BIT(0)	/* INIT */
 #define IVHD_DEV_EXTINTR_PASS	BIT(1)	/* ExtInt */
 #define IVHD_DEV_NMI_PASS	BIT(2)	/* NMI */
 
 /* IVHD 8-byte extended data settings. */
 #define IVHD_DEV_EXT_ATS_DISABLE	BIT(31)	/* Disable ATS */
 
 /* IOMMU control register. */
 #define AMDVI_CTRL_EN		BIT(0)	/* IOMMU enable. */
 #define AMDVI_CTRL_HTT		BIT(1)	/* Hypertransport tunnel enable. */
 #define AMDVI_CTRL_ELOG		BIT(2)	/* Event log enable. */
 #define AMDVI_CTRL_ELOGINT	BIT(3)	/* Event log interrupt. */
 #define AMDVI_CTRL_COMINT	BIT(4)	/* Completion wait interrupt. */
 #define AMDVI_CTRL_PPW		BIT(8)
 #define AMDVI_CTRL_RPPW		BIT(9)
 #define AMDVI_CTRL_COH		BIT(10)
 #define AMDVI_CTRL_ISOC		BIT(11)
 #define AMDVI_CTRL_CMD		BIT(12)	/* Command buffer enable. */
 #define AMDVI_CTRL_PPRLOG	BIT(13)
 #define AMDVI_CTRL_PPRINT	BIT(14)
 #define AMDVI_CTRL_PPREN	BIT(15)
 #define AMDVI_CTRL_GTE		BIT(16)	/* Guest translation enable. */
 #define AMDVI_CTRL_GAE		BIT(17)	/* Guest APIC enable. */
 
 /*
  * Invalidation timeout.
  *
  * These are small selector values (0..6), not single-bit masks like the
  * AMDVI_CTRL_* definitions above; note that 4 is numerically equal to
  * BIT(2).
  * NOTE(review): presumably they select the invalidation timeout field of
  * the control register and must be shifted to that field's position before
  * being OR-ed into a control value -- confirm against the AMD IOMMU
  * specification.
  */
 #define AMDVI_CTRL_INV_NO_TO	0	/* No timeout. */
 #define AMDVI_CTRL_INV_TO_1ms	1	/* 1 ms */
 #define AMDVI_CTRL_INV_TO_10ms	2	/* 10 ms */
 #define AMDVI_CTRL_INV_TO_100ms	3	/* 100 ms */
 #define AMDVI_CTRL_INV_TO_1S	4	/* 1 second */
 #define AMDVI_CTRL_INV_TO_10S	5	/* 10 second */
 #define AMDVI_CTRL_INV_TO_100S	6	/* 100 second */
 
 /*
  * Max number of PCI devices.
  * 256 bus x 32 slot/devices x 8 functions.
  */
 #define PCI_NUM_DEV_MAX		0x10000
 
 /* Maximum number of domains supported by IOMMU. */
 #define AMDVI_MAX_DOMAIN	(BIT(16) - 1)
 
 /*
  * IOMMU Page Table attributes.
  */
 #define AMDVI_PT_PRESENT	BIT(0)
 #define AMDVI_PT_COHERENT	BIT(60)
 #define AMDVI_PT_READ		BIT(61)
 #define AMDVI_PT_WRITE		BIT(62)
 
 #define AMDVI_PT_RW		(AMDVI_PT_READ | AMDVI_PT_WRITE)
 #define AMDVI_PT_MASK		0xFFFFFFFFFF000UL /* Only [51:12] for PA */
 
 #define AMDVI_PD_LEVEL_SHIFT	9
 #define AMDVI_PD_SUPER(x)	(((x) >> AMDVI_PD_LEVEL_SHIFT) == 7)
 /*
  * IOMMU Status, offset 0x2020
  */
 #define AMDVI_STATUS_EV_OF		BIT(0)	/* Event overflow. */
 #define AMDVI_STATUS_EV_INTR		BIT(1)	/* Event interrupt. */
 /* Completion wait command completed. */
 #define AMDVI_STATUS_CMP		BIT(2)
 
 #define	IVRS_CTRL_RID			1	/* MMIO RID */
 
 /*
  * ACPI IVHD
  *
  * One device configuration record taken from the IVHD device entries:
  * the settings in 'data' apply to device ids start_id through end_id
  * (both the same for a single-device entry).
  */
 struct ivhd_dev_cfg {
 	uint32_t start_id;		/* First device id of the range. */
 	uint32_t end_id;		/* Last device id of the range. */
 	uint8_t	 data;			/* Device configuration. */
 	bool	 enable_ats;		/* ATS enabled for the device. */
 	int	 ats_qlen;		/* ATS invalidation queue depth. */
 };
 
 /*
  * One IOMMU protection domain: its page table root and level, its id and
  * the linkage that chains it into a singly-linked list of domains.
  */
 struct amdvi_domain {
 	uint64_t *ptp;			/* Highest level page table */
 	int	ptp_level;		/* Level of page tables */
 	u_int	id;			/* Domain id */
 	SLIST_ENTRY (amdvi_domain) next;
 };
 
 /*
  * AMD IOMMU softc.
  *
  * Per-device state of one "ivhd" device.
  */
 struct amdvi_softc {
 	struct amdvi_ctrl *ctrl;	/* Control area. */
 	device_t 	dev;		/* IOMMU device. */
 	bool		iotlb;		/* IOTLB supported by IOMMU */
 	struct amdvi_cmd *cmd;		/* Command descriptor area. */
 	int 		cmd_max;	/* Max number of commands. */
 	uint64_t	cmp_data;	/* Command completion write back. */
 	struct amdvi_event *event;	/* Event descriptor area. */
 	struct resource *event_res;	/* Event interrupt resource. */
 	void   		*event_tag;	/* Event interrupt tag. */
 	int		event_max;	/* Max number of events. */
 	int		event_irq;
 	int		event_rid;
 	/* ACPI various flags. */
 	uint32_t 	ivhd_flag;	/* ACPI IVHD flag. */
 	uint32_t 	ivhd_efr;	/* ACPI v1 Reserved or v2 EFR . */
 	/* PCI related. */
 	uint16_t 	cap_off;	/* PCI Capability offset. */
 	uint8_t		pci_cap;	/* PCI capability. */
 	uint64_t 	pci_efr;	/* PCI EFR for rev2.0 */
 	uint16_t 	pci_seg;	/* IOMMU PCI domain/segment. */
 	uint16_t 	pci_rid;	/* PCI BDF of IOMMU */
 	/* Device range under this IOMMU. */
 	uint16_t 	start_dev_rid;	/* First device under this IOMMU. */
 	uint16_t 	end_dev_rid;	/* Last device under this IOMMU. */
 
 	/*
 	 * BIOS provided device configuration for end points.
 	 * The table has a fixed capacity of 10 records; dev_cfg_cnt is the
 	 * number of records in use.
 	 */
 	struct 		ivhd_dev_cfg dev_cfg[10];
 	int		dev_cfg_cnt;
 
 	/* Software statistics. */
 	uint64_t 	event_intr_cnt;	/* Total event INTR count. */
 	uint64_t 	total_cmd;	/* Total number of commands. */
 };
 
 /*
  * Hardware bring-up and tear-down for one IOMMU.  A non-zero return from
  * amdvi_setup_hw() is treated as an error by its caller.
  */
 int	amdvi_setup_hw(struct amdvi_softc *softc);
 int	amdvi_teardown_hw(struct amdvi_softc *softc);
 #endif /* _AMDVI_PRIV_H_ */
Index: projects/bsd_rdma_4_9/sys/amd64/vmm/amd/ivrs_drv.c
===================================================================
--- projects/bsd_rdma_4_9/sys/amd64/vmm/amd/ivrs_drv.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/amd64/vmm/amd/ivrs_drv.c	(revision 326162)
@@ -1,500 +1,512 @@
 /*-
  * Copyright (c) 2016, Anish Gupta (anish@freebsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 
 #include <machine/vmparam.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
 #include <dev/acpica/acpivar.h>
 
 #include "io/iommu.h"
 #include "amdvi_priv.h"
 
 device_t *ivhd_devs;			/* IVHD or AMD-Vi device list. */
 int	ivhd_count;			/* Number of IVHD or AMD-Vi devices. */
 
 extern int amdvi_ptp_level;		/* Page table levels. */
 
 /*
  * Callback invoked for each IVRS sub-table during a table walk.  It gets
  * the sub-table header and the caller's opaque argument; a non-zero return
  * continues the walk, zero stops it.
  */
 typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER * ptr, void *arg);
 
 /*
  * Iterate IVRS table for IVHD and IVMD device type.
  *
  * Every IVHD/IVMD sub-table is handed to 'iter' together with the opaque
  * 'arg'.  The walk ends when 'iter' returns 0, when the end of the table is
  * reached, or when a sub-table header is malformed: it does not fit in the
  * remaining bytes, or it advertises a length shorter than the header itself
  * (a zero length would otherwise keep the cursor from ever advancing).
  * Nothing is done when the IVRS table is absent or empty.
  */
 static void
 ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg)
 {
 	ACPI_TABLE_IVRS *ivrs;
 	ACPI_IVRS_HEADER *ivrs_hdr, *end;
 	ACPI_STATUS status;
 	size_t left;
 
 	status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs);
 	if (ACPI_FAILURE(status))
 		return;
 
 	if (ivrs->Header.Length == 0) {
 		return;
 	}
 
 	ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1);
 	end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length);
 
 	while (ivrs_hdr < end) {
 		/* Bytes between the cursor and the end of the IVRS table. */
 		left = (size_t)((uint8_t *)end - (uint8_t *)ivrs_hdr);
 
 		/* The header must fit before its Length can be trusted. */
 		if (left < sizeof(*ivrs_hdr)) {
 			printf("AMD-Vi:IVHD/IVMD is corrupted, "
 			    "truncated header\n");
 			break;
 		}
 
 		if (ivrs_hdr->Length < sizeof(*ivrs_hdr) ||
 		    ivrs_hdr->Length > left) {
 			printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n",
 			    ivrs_hdr->Length);
 			break;
 		}
 
 		switch (ivrs_hdr->Type) {
 		case ACPI_IVRS_TYPE_HARDWARE:	/* Legacy */
 		case 0x11:
 		case 0x40: 			/* ACPI HID */
 			if (!iter(ivrs_hdr, arg))
 				return;
 			break;
-		
+
 		case ACPI_IVRS_TYPE_MEMORY1:
 		case ACPI_IVRS_TYPE_MEMORY2:
 		case ACPI_IVRS_TYPE_MEMORY3:
 			if (!iter(ivrs_hdr, arg))
 				return;
 
 			break;
-		
+
 		default:
 			printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type);
 
 		}
 
 		ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr +
 			ivrs_hdr->Length);
 	}
 }
 
 /* Return 1 when 'type' is one of the IVHD sub-table types, 0 otherwise. */
 static  int
 ivrs_is_ivhd(UINT8 type)
 {
 
 	switch (type) {
 	case ACPI_IVRS_TYPE_HARDWARE:
 	case 0x11:
 	case 0x40:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Table-walk callback that counts AMD-Vi devices: the global ivhd_count is
  * bumped for every IVHD sub-table.  It always returns 1 so that the walk
  * carries on to the end of the table.
  */
 static int
 ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg)
 {
 
 	ivhd_count += ivrs_is_ivhd(ivrs_he->Type) ? 1 : 0;
 
 	return (1);
 }
 
 /* Search state shared between ivhd_find_by_index() and its callback. */
 struct find_ivrs_hdr_args {
 	int	i;			/* IVHD sub-tables still to skip. */
 	ACPI_IVRS_HEADER *ptr;		/* Match, or NULL if none was found. */
 };
 
 /*
  * Table-walk callback that locates the N-th IVHD sub-table.  'args' is a
  * struct find_ivrs_hdr_args whose counter says how many IVHD sub-tables
  * remain to be skipped; when it is zero the current one is stored and 0 is
  * returned to stop the walk.  Other sub-tables are passed over.
  */
 static int
 ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args)
 {
 	struct find_ivrs_hdr_args *search = args;
 
 	if (!ivrs_is_ivhd(ivrs_hdr->Type))
 		return (1);
 
 	if (search->i != 0) {
 		search->i--;
 		return (1);
 	}
 
 	search->ptr = ivrs_hdr;
 	return (0);
 }
 
 static ACPI_IVRS_HARDWARE *
 ivhd_find_by_index(int idx)
 {
 	struct find_ivrs_hdr_args fi;
 
 	fi.i = idx;
 	fi.ptr = NULL;
 
 	ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi);
 
 	return ((ACPI_IVRS_HARDWARE *)fi.ptr);
 }
 
 /*
  * Append one BIOS-provided device configuration record to softc->dev_cfg[].
  *
  * start_id, end_id: first and last device id the settings apply to.
  * cfg: data-setting byte of the entry; 0 means the device needs nothing
  *      special and no record is stored.
  * ats: whether ATS is to be enabled for the range.
  *
  * dev_cfg[] has a fixed capacity.  Once it is full, further records are
  * dropped with a warning instead of being written past the end of the
  * array.
  */
 static void
 ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id,
     uint32_t end_id, uint8_t cfg, bool ats)
 {
 	struct ivhd_dev_cfg *dev_cfg;
 
 	/* If device doesn't have special data, don't add it. */
 	if (!cfg)
 		return;
 
 	/*
 	 * The size_t conversion also rejects a (corrupt) negative count, which
 	 * becomes a huge unsigned value.
 	 */
 	if ((size_t)softc->dev_cfg_cnt >=
 	    sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0])) {
 		device_printf(softc->dev,
 		    "WARN Too many device entries.\n");
 		return;
 	}
 
 	dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++];
 	dev_cfg->start_id = start_id;
 	dev_cfg->end_id = end_id;
 	dev_cfg->data = cfg;
 	dev_cfg->enable_ats = ats;
 }
 
 /*
  * Record device attributes as suggested by BIOS.
  *
  * Walks the device entries that follow the IVHD header, stores the ones
  * that carry settings through ivhd_dev_add_entry() and tracks the lowest
  * and highest device id seen in softc->start_dev_rid / end_dev_rid.
  * Returns 0, or EINVAL when the "too many device entries" check fires.
  */
 static int
 ivhd_dev_parse(ACPI_IVRS_HARDWARE * ivhd, struct amdvi_softc *softc)
 {
-	ACPI_IVRS_DE_HEADER *de, *end;
+	ACPI_IVRS_DE_HEADER *de;
+	uint8_t *p, *end;
 	int range_start_id = 0, range_end_id = 0;
 	uint32_t *extended;
 	uint8_t all_data = 0, range_data = 0;
 	bool range_enable_ats = false, enable_ats;
 
 	/* Start with an empty range: lowest id at maximum, highest at 0. */
 	softc->start_dev_rid = ~0;
 	softc->end_dev_rid = 0;
 
-	de = (ACPI_IVRS_DE_HEADER *) ((uint8_t *)ivhd +
-	    sizeof(ACPI_IVRS_HARDWARE));
-	end = (ACPI_IVRS_DE_HEADER *) ((uint8_t *)ivhd +
-	    ivhd->Header.Length);
+	/*
+	 * XXX The following actually depends on Header.Type and
+	 * is only true for 0x10.
+	 */
+	p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE);
+	end = (uint8_t *)ivhd + ivhd->Header.Length;
 
-	while (de < (ACPI_IVRS_DE_HEADER *) end) {
+	while (p < end) {
+		de = (ACPI_IVRS_DE_HEADER *)p;
 		/* Every entry's id widens the range, whatever its type. */
 		softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id);
 		softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id);
 		switch (de->Type) {
 		case ACPI_IVRS_TYPE_ALL:
 			/* Settings OR-ed into every entry recorded later. */
 			all_data = de->DataSetting;
 			break;
 
 		case ACPI_IVRS_TYPE_SELECT:
 		case ACPI_IVRS_TYPE_ALIAS_SELECT:
 		case ACPI_IVRS_TYPE_EXT_SELECT:
 			/* Single device: start and end id are the same. */
 			enable_ats = false;
 			if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) {
 				/* Extended data is the word after the header. */
 				extended = (uint32_t *)(de + 1);
 				enable_ats =
 				    (*extended & IVHD_DEV_EXT_ATS_DISABLE) ?
 					false : true;
 			}
 			ivhd_dev_add_entry(softc, de->Id, de->Id,
 			    de->DataSetting | all_data, enable_ats);
 			break;
 
 		case ACPI_IVRS_TYPE_START:
 		case ACPI_IVRS_TYPE_ALIAS_START:
 		case ACPI_IVRS_TYPE_EXT_START:
 			/* Remember the range start until the END entry. */
 			range_start_id = de->Id;
 			range_data = de->DataSetting;
 			if (de->Type == ACPI_IVRS_TYPE_EXT_START) {
 				extended = (uint32_t *)(de + 1);
 				range_enable_ats =
 				    (*extended & IVHD_DEV_EXT_ATS_DISABLE) ?
 					false : true;
 			}
 			break;
 
 		case ACPI_IVRS_TYPE_END:
 			/* Close the open range and record it as one entry. */
 			range_end_id = de->Id;
 			ivhd_dev_add_entry(softc, range_start_id, range_end_id,
 				range_data | all_data, range_enable_ats);
 			range_start_id = range_end_id = 0;
 			range_data = 0;
 			all_data = 0;
 			break;
 
 		case ACPI_IVRS_TYPE_PAD4:
 			break;
 
 		case ACPI_IVRS_TYPE_SPECIAL:
 			/* HPET or IOAPIC */
 			break;
 		default:
 			if ((de->Type < 5) ||
 			    (de->Type >= ACPI_IVRS_TYPE_PAD8))
 				device_printf(softc->dev,
 				    "Unknown dev entry:0x%x\n", de->Type);
 		}
 
 		/*
 		 * NOTE(review): this test runs after the entry has already
 		 * been stored and uses '>', so it looks unable to stop a
 		 * store one slot past the end of dev_cfg[] -- confirm.
 		 */
 		if (softc->dev_cfg_cnt >
 			(sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) {
 			device_printf(softc->dev,
 			    "WARN Too many device entries.\n");
 			return (EINVAL);
 		}
-		de++;
+		if (de->Type < 0x40)
+			p += sizeof(ACPI_IVRS_DEVICE4);
+		else if (de->Type < 0x80)
+			p += sizeof(ACPI_IVRS_DEVICE8A);
+		else {
+			printf("Variable size IVHD type 0x%x not supported\n",
+			    de->Type);
+			break;
+		}
 	}
 
 	KASSERT((softc->end_dev_rid >= softc->start_dev_rid),
 	    ("Device end[0x%x] < start[0x%x.\n",
 	    softc->end_dev_rid, softc->start_dev_rid));
 
 	return (0);
 }
 
 /*
  * Bus identify method: add one "ivhd" child to 'parent' for every IVHD
  * sub-table of the ACPI IVRS table and remember the children in ivhd_devs[].
  * Nothing is created when "ivhd" is disabled, when there is no usable IVRS
  * table or when it lists no IVHD sub-table.  On return ivhd_count holds the
  * number of children that could be added or located.
  *
  * NOTE(review): ivhd_count is not reset before the counting pass and a
  * previously allocated ivhd_devs[] is not released, so a second call within
  * the same module lifetime looks like it would accumulate -- confirm whether
  * that can happen.
  */
 static void
 ivhd_identify(driver_t *driver, device_t parent)
 {
 	ACPI_TABLE_IVRS *ivrs;
 	ACPI_STATUS status;
 	device_t child;
 	uint32_t info;
 	int found, unit;
 
 	if (acpi_disabled("ivhd"))
 		return;
 
 	status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs);
 	if (ACPI_FAILURE(status) || ivrs->Header.Length == 0)
 		return;
 
 	info = ivrs->Info;
 	printf("AMD-Vi IVRS VAsize = %d PAsize = %d GVAsize = %d flags:%b\n",
 	    REG_BITS(info, 21, 15), REG_BITS(info, 14, 8),
 	    REG_BITS(info, 7, 5), REG_BITS(info, 22, 22),
 	    "\020\001HtAtsResv");
 
 	/* First pass over the table: how many IOMMUs are there? */
 	ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL);
 	if (ivhd_count == 0)
 		return;
 
 	ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF,
 	    M_WAITOK | M_ZERO);
 
 	found = 0;
 	for (unit = 0; unit < ivhd_count; unit++) {
 		if (ivhd_find_by_index(unit) == NULL) {
 			printf("Can't find IVHD entry%d\n", unit);
 			continue;
 		}
 
 		child = BUS_ADD_CHILD(parent, 1, "ivhd", unit);
 		/*
 		 * XXX: the add fails when an earlier instance of the device
 		 * was never destroyed; fall back to that old instance.
 		 */
 		if (child == NULL)
 			child = device_find_child(parent, "ivhd", unit);
 		ivhd_devs[unit] = child;
 		if (child == NULL) {
 			printf("AMD-Vi: cant find AMD-Vi dev%d\n", unit);
 			break;
 		}
 		found++;
 	}
 
 	/* Only the devices that actually exist are counted from here on. */
 	ivhd_count = found;
 }
 
 /*
  * Probe method: only devices without an ACPI handle are claimed, and they
  * are matched with BUS_PROBE_NOWILDCARD; devices that do have a handle get
  * ENXIO.
  */
 static int
 ivhd_probe(device_t dev)
 {
 
 	if (acpi_get_handle(dev) == NULL) {
 		device_set_desc(dev, "AMD-Vi/IOMMU or ivhd");
 		return (BUS_PROBE_NOWILDCARD);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Print the IVHD flags and, when present, the extended feature word of an
  * IOMMU, then check that it supports the page table depth requested in
  * amdvi_ptp_level.
  *
  * Returns 0 on success, or EINVAL when the supported paging level is lower
  * than amdvi_ptp_level.  'ivhd' is not used.
  */
 static int
 ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd)
 {
 	device_t dev;
 	int max_ptp_level;
 
 	dev = softc->dev;
 	/*
 	 * Bit numbers in a %b description are single characters written as
 	 * octal escapes.  Bit 8 must be "\010": "\008" is not an octal escape,
 	 * it is a NUL followed by '8', which ends the description early.
 	 */
 	device_printf(dev, "Flag:%b\n", softc->ivhd_flag,
 	    "\020\001HtTunEn\002PassPW\003ResPassPW\004Isoc\005IotlbSup"
 	    "\006Coherent\007PreFSup\010PPRSup");
 	/*
 	 * If no extended feature[EFR], its rev1 with maximum paging level as 7.
 	 */
 	max_ptp_level = 7;
 	if (softc->ivhd_efr) {
 		/*
 		 * NOTE(review): only the "\008" escape is corrected below; the
 		 * bit names themselves have not been checked against the
 		 * register layout -- confirm.
 		 */
 		device_printf(dev, "EFR HATS = %d GATS = %d GLXSup = %d "
 		    "MsiNumPr = %d PNBanks= %d PNCounters= %d\n"
 		    "max PASID = %d EFR: %b \n",
 		    REG_BITS(softc->ivhd_efr, 31, 30),
 		    REG_BITS(softc->ivhd_efr, 29, 28),
 		    REG_BITS(softc->ivhd_efr, 4, 3),
 		    REG_BITS(softc->ivhd_efr, 27, 23),
 		    REG_BITS(softc->ivhd_efr, 22, 17),
 		    REG_BITS(softc->ivhd_efr, 16, 13),
 		    REG_BITS(softc->ivhd_efr, 12, 8),
 		    softc->ivhd_efr, "\020\001XTSup\002NXSup\003GTSup\005IASup"
 		    "\006GASup\007HESup\010PPRSup");
 
 		/* The two HATS bits encode the supported level as value + 4. */
 		max_ptp_level = REG_BITS(softc->ivhd_efr, 31, 30) + 4;
 	}
 
 	/* Make sure device support minimum page level as requested by user. */
 	if (max_ptp_level < amdvi_ptp_level) {
 		device_printf(dev, "Insufficient PTP level:%d\n",
 		    max_ptp_level);
 		return (EINVAL);
 	}
 
 	device_printf(softc->dev, "max supported paging level:%d restricting to: %d\n",
 	    max_ptp_level, amdvi_ptp_level);
 	device_printf(softc->dev, "device supported range "
 	    "[0x%x - 0x%x]\n", softc->start_dev_rid, softc->end_dev_rid);
 
 	return (0);
 }
 
 /*
  * Attach method: fill the softc from the IVHD sub-table that matches the
  * device's unit number, parse its device entries, print its capabilities
  * and bring the hardware up.
  *
  * Returns 0 on success, EINVAL when no IVHD sub-table exists for the unit,
  * or the error from the capability check or the hardware setup.  A device
  * entry parsing error is reported but does not fail the attach.
  */
 static int
 ivhd_attach(device_t dev)
 {
 	ACPI_IVRS_HARDWARE *ivhd;
 	struct amdvi_softc *softc;
 	int error, unit;
 
 	unit = device_get_unit(dev);
 	/* Make sure its same device for which attach is called. */
 	if (ivhd_devs[unit] != dev)
 		panic("Not same device old %p new %p", ivhd_devs[unit], dev);
 
 	softc = device_get_softc(dev);
 	softc->dev = dev;
 
 	ivhd = ivhd_find_by_index(unit);
 	if (ivhd == NULL)
 		return (EINVAL);
 
 	/* Copy what the IVHD header says about this IOMMU. */
 	softc->pci_seg = ivhd->PciSegmentGroup;
 	softc->pci_rid = ivhd->Header.DeviceId;
 	softc->ivhd_flag = ivhd->Header.Flags;
 	softc->ivhd_efr = ivhd->Reserved;
 	/* 
 	 * PCI capability has more capabilities that are not part of IVRS.
 	 */
 	softc->cap_off = ivhd->CapabilityOffset;
 
 #ifdef notyet
 	/* IVHD Info bit[4:0] is event MSI/X number. */
 	softc->event_msix = ivhd->Info & 0x1F;
 #endif
 	softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress);
 
 	error = ivhd_dev_parse(ivhd, softc);
 	if (error != 0)
 		device_printf(dev,
 		    "endpoint device parsing error=%d\n", error);
 
 	error = ivhd_print_cap(softc, ivhd);
 	if (error != 0)
 		return (error);
 
 	error = amdvi_setup_hw(softc);
 	if (error != 0)
 		device_printf(dev, "couldn't be initialised, error=%d\n",
 		    error);
 
 	return (error);
 }
 
 /*
  * Detach method: tear the IOMMU hardware down.  The result of the teardown
  * is not looked at and 0 is always returned.
  */
 static int
 ivhd_detach(device_t dev)
 {
 
 	amdvi_teardown_hw(device_get_softc(dev));
 
 	/*
 	 * XXX: delete the device.
 	 * don't allow detach, return EBUSY.
 	 */
 	return (0);
 }
 
 /* Suspend method: nothing is saved or changed; always reports success. */
 static int
 ivhd_suspend(device_t dev)
 {
 
 	return (0);
 }
 
 /* Resume method: nothing is restored; always reports success. */
 static int
 ivhd_resume(device_t dev)
 {
 
 	return (0);
 }
 
 /* Device interface methods implemented by the ivhd driver. */
 static device_method_t ivhd_methods[] = {
 	DEVMETHOD(device_identify, ivhd_identify),
 	DEVMETHOD(device_probe, ivhd_probe),
 	DEVMETHOD(device_attach, ivhd_attach),
 	DEVMETHOD(device_detach, ivhd_detach),
 	DEVMETHOD(device_suspend, ivhd_suspend),
 	DEVMETHOD(device_resume, ivhd_resume),
 	DEVMETHOD_END
 };
 
 /*
  * Driver description: name, method table and the size of the per-device
  * softc (struct amdvi_softc).
  */
 static driver_t ivhd_driver = {
 	"ivhd",
 	ivhd_methods,
 	sizeof(struct amdvi_softc),
 };
 
 static devclass_t ivhd_devclass;
 
 /*
  * Load this module at the end after PCI re-probing to configure interrupt.
  *
  * The driver is registered on the acpi bus with order SI_ORDER_ANY and
  * declares module dependencies on acpi and pci.
  */
 DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0,
 		      SI_ORDER_ANY);
 MODULE_DEPEND(ivhd, acpi, 1, 1, 1);
 MODULE_DEPEND(ivhd, pci, 1, 1, 1);
Index: projects/bsd_rdma_4_9/sys/arm/allwinner/clkng/aw_ccung.c
===================================================================
--- projects/bsd_rdma_4_9/sys/arm/allwinner/clkng/aw_ccung.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/arm/allwinner/clkng/aw_ccung.c	(revision 326162)
@@ -1,412 +1,428 @@
 /*-
  * Copyright (c) 2017 Emmanuel Vadot <manu@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Allwinner Clock Control Unit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <machine/bus.h>
 
 #include <dev/fdt/simplebus.h>
 
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <dev/extres/clk/clk.h>
 #include <dev/extres/clk/clk_gate.h>
 
 #include <dev/extres/hwreset/hwreset.h>
 
 #include <arm/allwinner/clkng/aw_ccung.h>
 #include <arm/allwinner/clkng/aw_clk.h>
 
 #ifdef __aarch64__
 #include "opt_soc.h"
 #endif
 
 #if defined(SOC_ALLWINNER_A13)
 #include <arm/allwinner/clkng/ccu_a13.h>
 #endif
 
 #if defined(SOC_ALLWINNER_A31)
 #include <arm/allwinner/clkng/ccu_a31.h>
 #endif
 
 #if defined(SOC_ALLWINNER_A64)
 #include <arm/allwinner/clkng/ccu_a64.h>
 #include <arm/allwinner/clkng/ccu_sun8i_r.h>
 #endif
 
 #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5)
 #include <arm/allwinner/clkng/ccu_h3.h>
 #include <arm/allwinner/clkng/ccu_sun8i_r.h>
 #endif
 
+#if defined(SOC_ALLWINNER_A83T)
+#include <arm/allwinner/clkng/ccu_a83t.h>
+#endif
+
 #include "clkdev_if.h"
 #include "hwreset_if.h"
 
 /* Resources requested at attach: a single active memory region (rid 0). */
 static struct resource_spec aw_ccung_spec[] = {
 	{ SYS_RES_MEMORY,	0,	RF_ACTIVE },
 	{ -1, 0 }
 };
 
 /*
  * Per-SoC CCU type identifiers.  Each one is only defined when the
  * matching SOC_ALLWINNER_* option is set; they are stored as ocd_data in
  * compat_data[] and switched on in aw_ccung_attach().  All values are
  * non-zero because aw_ccung_probe() treats ocd_data == 0 as "no match".
  */
 #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5)
 #define	H3_CCU		1
 #define	H3_R_CCU	2
 #endif
 
 #if defined(SOC_ALLWINNER_A31)
 #define	A31_CCU		3
 #endif
 
 #if defined(SOC_ALLWINNER_A64)
 #define	A64_CCU		4
 #define	A64_R_CCU	5
 #endif
 
 #if defined(SOC_ALLWINNER_A13)
 #define	A13_CCU		6
 #endif
 
+#if defined(SOC_ALLWINNER_A83T)
+#define	A83T_CCU	7
+#endif
+
+
 /*
  * Device-tree "compatible" strings handled by this driver, mapped to the
  * per-SoC CCU type id.  Every entry must be guarded by the same SOC option
  * that defines its type id, otherwise the id is undeclared (or the entry
  * is silently omitted) in single-SoC kernels.  The table is terminated by
  * a NULL entry whose data value (0) means "no match".
  */
 static struct ofw_compat_data compat_data[] = {
 #if defined(SOC_ALLWINNER_A13)
 	{ "allwinner,sun5i-a13-ccu", A13_CCU},
 #endif
 #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5)
 	{ "allwinner,sun8i-h3-ccu", H3_CCU },
 	{ "allwinner,sun8i-h3-r-ccu", H3_R_CCU },
 #endif
 #if defined(SOC_ALLWINNER_A31)
 	{ "allwinner,sun6i-a31-ccu", A31_CCU },
 #endif
 #if defined(SOC_ALLWINNER_A64)
 	{ "allwinner,sun50i-a64-ccu", A64_CCU },
 	{ "allwinner,sun50i-a64-r-ccu", A64_R_CCU },
 #endif
+#if defined(SOC_ALLWINNER_A83T)
+	{ "allwinner,sun8i-a83t-ccu", A83T_CCU },
+#endif
 	{NULL, 0 }
 };
 
 /* 32-bit register accessors operating on the softc's memory resource. */
 #define	CCU_READ4(sc, reg)		bus_read_4((sc)->res, (reg))
 #define	CCU_WRITE4(sc, reg, val)	bus_write_4((sc)->res, (reg), (val))
 
 /* clkdev write_4 method: store val to the CCU register at addr; returns 0. */
 static int
 aw_ccung_write_4(device_t dev, bus_addr_t addr, uint32_t val)
 {
 	struct aw_ccung_softc *softc = device_get_softc(dev);
 
 	CCU_WRITE4(softc, addr, val);
 
 	return (0);
 }
 
 /* clkdev read_4 method: load the CCU register at addr into *val; returns 0. */
 static int
 aw_ccung_read_4(device_t dev, bus_addr_t addr, uint32_t *val)
 {
 	struct aw_ccung_softc *softc = device_get_softc(dev);
 
 	*val = CCU_READ4(softc, addr);
 
 	return (0);
 }
 
 /*
  * clkdev modify_4 method: read the register at addr, clear the bits in
  * clr, then set the bits in set, and write the result back.  Returns 0.
  * No lock is taken here.
  */
 static int
 aw_ccung_modify_4(device_t dev, bus_addr_t addr, uint32_t clr, uint32_t set)
 {
 	struct aw_ccung_softc *softc = device_get_softc(dev);
 	uint32_t value;
 
 	value = CCU_READ4(softc, addr);
 	value = (value & ~clr) | set;
 	CCU_WRITE4(softc, addr, value);
 
 	return (0);
 }
 
 /*
  * hwreset assert method: drive reset line `id` to the requested state.
  *
  * The register bit is active-low: asserting (reset == true) clears the bit
  * at resets[id].shift, deasserting sets it.  Ids that are out of range, or
  * whose table entry has offset 0, are ignored.  Always returns 0.
  */
 static int
 aw_ccung_reset_assert(device_t dev, intptr_t id, bool reset)
 {
 	struct aw_ccung_softc *sc;
 	uint32_t mask, val;
 
 	sc = device_get_softc(dev);
 
 	/* id is signed: reject negative values before indexing the table. */
 	if (id < 0 || id >= sc->nresets || sc->resets[id].offset == 0)
 		return (0);
 
 	mtx_lock(&sc->mtx);
 	/* Unsigned constant so that a shift of 31 is well defined. */
 	mask = 1U << sc->resets[id].shift;
 	val = CCU_READ4(sc, sc->resets[id].offset);
 	if (reset)
 		val &= ~mask;
 	else
 		val |= mask;
 	CCU_WRITE4(sc, sc->resets[id].offset, val);
 	mtx_unlock(&sc->mtx);
 
 	return (0);
 }
 
 /*
  * hwreset is_asserted method: report whether reset line `id` is asserted.
  *
  * The register bit is active-low, so *reset is true when the bit at
  * resets[id].shift is clear.  For ids that are out of range, or whose
  * table entry has offset 0, *reset is left untouched.  Always returns 0.
  */
 static int
 aw_ccung_reset_is_asserted(device_t dev, intptr_t id, bool *reset)
 {
 	struct aw_ccung_softc *sc;
 	uint32_t mask, val;
 
 	sc = device_get_softc(dev);
 
 	/* id is signed: reject negative values before indexing the table. */
 	if (id < 0 || id >= sc->nresets || sc->resets[id].offset == 0)
 		return (0);
 
 	mtx_lock(&sc->mtx);
 	/* Unsigned constant so that a shift of 31 is well defined. */
 	mask = 1U << sc->resets[id].shift;
 	val = CCU_READ4(sc, sc->resets[id].offset);
 	*reset = (val & mask) == 0;
 	mtx_unlock(&sc->mtx);
 
 	return (0);
 }
 
 /* clkdev device_lock method: acquire the softc mutex. */
 static void
 aw_ccung_device_lock(device_t dev)
 {
 	struct aw_ccung_softc *softc = device_get_softc(dev);
 
 	mtx_lock(&softc->mtx);
 }
 
 /* clkdev device_unlock method: release the softc mutex. */
 static void
 aw_ccung_device_unlock(device_t dev)
 {
 	struct aw_ccung_softc *softc = device_get_softc(dev);
 
 	mtx_unlock(&softc->mtx);
 }
 
 /*
  * Probe method: match only nodes whose status is okay and whose
  * "compatible" string maps to a non-zero entry in compat_data[].
  * Returns BUS_PROBE_DEFAULT on a match, ENXIO otherwise.
  */
 static int
 aw_ccung_probe(device_t dev)
 {
 
 	if (!ofw_bus_status_okay(dev) ||
 	    ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
 		return (ENXIO);
 
 	device_set_desc(dev, "Allwinner Clock Control Unit NG");
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 static int
 aw_ccung_register_gates(struct aw_ccung_softc *sc)
 {
 	struct clk_gate_def def;
 	int i;
 
 	for (i = 0; i < sc->ngates; i++) {
 		if (sc->gates[i].name == NULL)
 			continue;
 		memset(&def, 0, sizeof(def));
 		def.clkdef.id = i;
 		def.clkdef.name = sc->gates[i].name;
 		def.clkdef.parent_names = &sc->gates[i].parent_name;
 		def.clkdef.parent_cnt = 1;
 		def.offset = sc->gates[i].offset;
 		def.shift = sc->gates[i].shift;
 		def.mask = 1;
 		def.on_value = 1;
 		def.off_value = 0;
 		clknode_gate_register(sc->clkdom, &def);
 	}
 
 	return (0);
 }
 
 /*
  * Apply the sc->clk_init[] table: for each entry, look the clock up by
  * name, then optionally set its parent, set its default frequency and
  * enable it, in that order.  A failure at any step is reported with
  * device_printf() and the remaining steps for that entry are skipped.
  */
 static void
 aw_ccung_init_clocks(struct aw_ccung_softc *sc)
 {
 	struct clknode *node;
 	int idx, rv;
 
 	for (idx = 0; idx < sc->n_clk_init; idx++) {
 		node = clknode_find_by_name(sc->clk_init[idx].name);
 		if (node == NULL) {
 			device_printf(sc->dev, "Cannot find clock %s\n",
 			    sc->clk_init[idx].name);
 			continue;
 		}
 
 		/* Step 1: reparent, if a parent is given. */
 		if (sc->clk_init[idx].parent_name != NULL) {
 			if (bootverbose)
 				device_printf(sc->dev, "Setting %s as parent for %s\n",
 				    sc->clk_init[idx].parent_name,
 				    sc->clk_init[idx].name);
 			rv = clknode_set_parent_by_name(node,
 			    sc->clk_init[idx].parent_name);
 			if (rv != 0) {
 				device_printf(sc->dev,
 				    "Cannot set parent to %s for %s\n",
 				    sc->clk_init[idx].parent_name,
 				    sc->clk_init[idx].name);
 				continue;
 			}
 		}
 
 		/* Step 2: program the default frequency, if non-zero. */
 		if (sc->clk_init[idx].default_freq != 0) {
 			rv = clknode_set_freq(node,
 			    sc->clk_init[idx].default_freq, 0, 0);
 			if (rv != 0) {
 				device_printf(sc->dev,
 				    "Cannot set frequency for %s to %ju\n",
 				    sc->clk_init[idx].name,
 				    sc->clk_init[idx].default_freq);
 				continue;
 			}
 		}
 
 		/* Step 3: enable, if requested. */
 		if (sc->clk_init[idx].enable && clknode_enable(node) != 0)
 			device_printf(sc->dev,
 			    "Cannot enable %s\n",
 			    sc->clk_init[idx].name);
 	}
 }
 
 /*
  * Attach method.
  *
  * Allocates the memory resource, initialises the softc mutex, creates a
  * clock domain and calls the SoC-specific ccu_*_register_clocks() routine
  * selected by the matched compat_data[] entry.  It then registers the gate
  * clocks (if any), finalises the domain, applies the clk_init[] table under
  * the domain's exclusive lock, and registers the device as a reset provider
  * when a resets table is present.
  *
  * Returns ENXIO if the resources cannot be allocated, 0 otherwise; failure
  * to create or finalise the clock domain panics.
  */
 static int
 aw_ccung_attach(device_t dev)
 {
 	struct aw_ccung_softc *sc;
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 
 	if (bus_alloc_resources(dev, aw_ccung_spec, &sc->res) != 0) {
 		device_printf(dev, "cannot allocate resources for device\n");
 		return (ENXIO);
 	}
 
 	mtx_init(&sc->mtx, device_get_nameunit(dev), NULL, MTX_DEF);
 
 	sc->type = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
 
 	sc->clkdom = clkdom_create(dev);
 	if (sc->clkdom == NULL)
 		panic("Cannot create clkdom\n");
 
 	/* No default case: an unknown type registers no clocks. */
 	switch (sc->type) {
 #if defined(SOC_ALLWINNER_A13)
 	case A13_CCU:
 		ccu_a13_register_clocks(sc);
 		break;
 #endif
 #if defined(SOC_ALLWINNER_H3) || defined(SOC_ALLWINNER_H5)
 	case H3_CCU:
 		ccu_h3_register_clocks(sc);
 		break;
 	case H3_R_CCU:
 		ccu_sun8i_r_register_clocks(sc);
 		break;
 #endif
 #if defined(SOC_ALLWINNER_A31)
 	case A31_CCU:
 		ccu_a31_register_clocks(sc);
 		break;
 #endif
 #if defined(SOC_ALLWINNER_A64)
 	case A64_CCU:
 		ccu_a64_register_clocks(sc);
 		break;
 	case A64_R_CCU:
 		ccu_sun8i_r_register_clocks(sc);
+		break;
+#endif
+#if defined(SOC_ALLWINNER_A83T)
+	case A83T_CCU:
+		ccu_a83t_register_clocks(sc);
 		break;
 #endif
 	}
 
 	/* NOTE(review): sc->gates/sc->resets are presumably filled in by the
 	 * register_clocks routine above -- confirm in the ccu_* sources. */
 	if (sc->gates)
 		aw_ccung_register_gates(sc);
 	if (clkdom_finit(sc->clkdom) != 0)
 		panic("cannot finalize clkdom initialization\n");
 
 	clkdom_xlock(sc->clkdom);
 	aw_ccung_init_clocks(sc);
 	clkdom_unlock(sc->clkdom);
 
 	if (bootverbose)
 		clkdom_dump(sc->clkdom);
 
 	/* If we have resets, register our self as a reset provider */
 	if (sc->resets)
 		hwreset_register_ofw_provider(dev);
 
 	return (0);
 }
 
 /*
  * Method table: device probe/attach, the clkdev register-access and
  * locking methods, and the hwreset assert/is_asserted methods.
  */
 static device_method_t aw_ccung_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		aw_ccung_probe),
 	DEVMETHOD(device_attach,	aw_ccung_attach),
 
 	/* clkdev interface */
 	DEVMETHOD(clkdev_write_4,	aw_ccung_write_4),
 	DEVMETHOD(clkdev_read_4,	aw_ccung_read_4),
 	DEVMETHOD(clkdev_modify_4,	aw_ccung_modify_4),
 	DEVMETHOD(clkdev_device_lock,	aw_ccung_device_lock),
 	DEVMETHOD(clkdev_device_unlock,	aw_ccung_device_unlock),
 
 	/* Reset interface */
 	DEVMETHOD(hwreset_assert,	aw_ccung_reset_assert),
 	DEVMETHOD(hwreset_is_asserted,	aw_ccung_reset_is_asserted),
 
 	DEVMETHOD_END
 };
 
 /* Driver description: name, method table, and per-device softc size. */
 static driver_t aw_ccung_driver = {
 	"aw_ccung",
 	aw_ccung_methods,
 	sizeof(struct aw_ccung_softc),
 };
 
 static devclass_t aw_ccung_devclass;
 
 /*
  * Registered on simplebus as an early driver, at pass
  * BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE.
  */
 EARLY_DRIVER_MODULE(aw_ccung, simplebus, aw_ccung_driver, aw_ccung_devclass,
     0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
 MODULE_VERSION(aw_ccung, 1);
Index: projects/bsd_rdma_4_9/sys/arm/arm/machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/arm/arm/machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/arm/arm/machdep.c	(revision 326162)
@@ -1,1243 +1,1247 @@
 /*	$NetBSD: arm32_machdep.c,v 1.44 2004/03/24 15:34:47 atatat Exp $	*/
 
 /*-
  * Copyright (c) 2004 Olivier Houchard
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Mark Brinicombe
  *	for the NetBSD Project.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Machine dependent functions for kernel setup
  *
  * Created      : 17/09/94
  * Updated	: 18/04/01 updated for new wscons
  */
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
 #include "opt_timer.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/devmap.h>
 #include <sys/efi.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/msgbuf.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 #include <machine/debug_monitor.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/pcb.h>
 #include <machine/physmem.h>
 #include <machine/platform.h>
 #include <machine/sysarch.h>
 #include <machine/undefined.h>
 #include <machine/vfp.h>
 #include <machine/vmparam.h>
 
 #ifdef FDT
 #include <dev/fdt/fdt_common.h>
 #include <machine/ofw_machdep.h>
 #endif
 
 #ifdef DEBUG
 #define	debugf(fmt, args...) printf(fmt, ##args)
 #else
 #define	debugf(fmt, args...)
 #endif
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) || \
     defined(COMPAT_FREEBSD9)
 #error FreeBSD/arm doesn't provide compatibility with releases prior to 10
 #endif
 
 #if __ARM_ARCH >= 6 && !defined(INTRNG)
 #error armv6 requires INTRNG
 #endif
 
 struct pcpu __pcpu[MAXCPU];
 struct pcpu *pcpup = &__pcpu[0];
 
 static struct trapframe proc0_tf;
 uint32_t cpu_reset_address = 0;
 int cold = 1;
 vm_offset_t vector_page;
 
 int (*_arm_memcpy)(void *, void *, int, int) = NULL;
 int (*_arm_bzero)(void *, int, int) = NULL;
 int _min_memcpy_size = 0;
 int _min_bzero_size = 0;
 
 extern int *end;
 
 #ifdef FDT
 vm_paddr_t pmap_pa;
 #if __ARM_ARCH >= 6
 vm_offset_t systempage;
 vm_offset_t irqstack;
 vm_offset_t undstack;
 vm_offset_t abtstack;
 #else
 /*
  * This is the number of L2 page tables required for covering max
  * (hypothetical) memsize of 4GB and all kernel mappings (vectors, msgbuf,
  * stacks etc.), uprounded to be divisible by 4.
  */
 #define KERNEL_PT_MAX	78
 static struct pv_addr kernel_pt_table[KERNEL_PT_MAX];
 struct pv_addr systempage;
 static struct pv_addr msgbufpv;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 static struct pv_addr kernelstack;
 #endif /* __ARM_ARCH >= 6 */
 #endif /* FDT */
 
 #ifdef PLATFORM
 static delay_func *delay_impl;
 static void *delay_arg;
 #endif
 
 struct kva_md_info kmi;
 
 /*
  * arm_vector_init:
  *
  *	Initialize the vector page, and select whether or not to
  *	relocate the vectors.
  *
  *	NOTE: We expect the vector page to be mapped at its expected
  *	destination.
  */
 
 extern unsigned int page0[], page0_data[];
 /*
  * Copy the selected exception vectors into the vector page at `va`.
  *
  * va:    virtual address of the (already mapped) vector page.
  * which: bit mask; bit N set means vector N (0 .. ARM_NVEC-1) is taken
  *        over.  For each such vector both the instruction word from page0
  *        and its data word from page0_data are copied.
  *
  * Afterwards the instruction cache is synchronised over the copied range
  * and `va` is recorded in vector_page.
  */
 void
 arm_vector_init(vm_offset_t va, int which)
 {
 	/* Cast matches the declared pointer type (was an int * cast). */
 	unsigned int *vectors = (unsigned int *)va;
 	/* Data words sit at the same distance from the insns as in page0. */
 	unsigned int *vectors_data = vectors + (page0_data - page0);
 	int vec;
 
 	/*
 	 * Loop through the vectors we're taking over, and copy the
 	 * vector's insn and data word.
 	 */
 	for (vec = 0; vec < ARM_NVEC; vec++) {
 		if ((which & (1 << vec)) == 0) {
 			/* Don't want to take over this vector. */
 			continue;
 		}
 		vectors[vec] = page0[vec];
 		vectors_data[vec] = page0_data[vec];
 	}
 
 	/* Now sync the vectors. */
 	icache_sync(va, (ARM_NVEC * 2) * sizeof(u_int));
 
 	vector_page = va;
 #if __ARM_ARCH < 6
 	if (va == ARM_VECTORS_HIGH) {
 		/*
 		 * Enable high vectors in the system control reg (SCTLR).
 		 *
 		 * Assume the MD caller knows what it's doing here, and really
 		 * does want the vector page relocated.
 		 *
 		 * Note: This has to be done here (and not just in
 		 * cpu_setup()) because the vector page needs to be
 		 * accessible *before* cpu_startup() is called.
 		 * Think ddb(9) ...
 		 */
 		cpu_control(CPU_CONTROL_VECRELOC, CPU_CONTROL_VECRELOC);
 	}
 #endif
 }
 
 /*
  * Machine-dependent startup hook, run via SYSINIT at SI_SUB_CPU /
  * SI_ORDER_FIRST.
  *
  * Identifies the CPU, initialises the kernel submaps, prints the real and
  * available memory sizes (plus the physmem and devmap tables when
  * bootverbose), initialises the buffer cache and pager buffers, and sets
  * thread0's saved SVC stack pointer and page directory.  On pre-ARMv6
  * kernels it also write-protects the vector page, finishes pmap setup and
  * maps the page at ARM_TP_ADDRESS, then resets the RAS start/end words.
  *
  * The `dummy` argument is unused.
  */
 static void
 cpu_startup(void *dummy)
 {
 	struct pcb *pcb = thread0.td_pcb;
 	const unsigned int mbyte = 1024 * 1024;
 #if __ARM_ARCH < 6 && !defined(ARM_CACHE_LOCK_ENABLE)
 	vm_page_t m;
 #endif
 
 	identify_arm_cpu();
 
 	vm_ksubmap_init(&kmi);
 
 	/*
 	 * Display the RAM layout.
 	 */
 	printf("real memory  = %ju (%ju MB)\n",
 	    (uintmax_t)arm32_ptob(realmem),
 	    (uintmax_t)arm32_ptob(realmem) / mbyte);
 	printf("avail memory = %ju (%ju MB)\n",
 	    (uintmax_t)arm32_ptob(vm_cnt.v_free_count),
 	    (uintmax_t)arm32_ptob(vm_cnt.v_free_count) / mbyte);
 	if (bootverbose) {
 		arm_physmem_print_tables();
 		devmap_print_table();
 	}
 
 	bufinit();
 	vm_pager_bufferinit();
 	/* Initial SVC-mode stack pointer for thread0. */
 	pcb->pcb_regs.sf_sp = (u_int)thread0.td_kstack +
 	    USPACE_SVC_STACK_TOP;
 	pmap_set_pcb_pagedir(kernel_pmap, pcb);
 #if __ARM_ARCH < 6
 	vector_page_setprot(VM_PROT_READ);
 	pmap_postinit();
 #ifdef ARM_CACHE_LOCK_ENABLE
 	pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS);
 	arm_lock_cache_line(ARM_TP_ADDRESS);
 #else
 	/* NOTE(review): the vm_page_alloc() result is used unchecked. */
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_ZERO);
 	pmap_kenter_user(ARM_TP_ADDRESS, VM_PAGE_TO_PHYS(m));
 #endif
 	*(uint32_t *)ARM_RAS_START = 0;
 	*(uint32_t *)ARM_RAS_END = 0xffffffff;
 #endif
 }
 
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /*
  * Write back the data cache for [ptr, ptr + len) after non-DMA I/O so
  * that the instruction cache can be made coherent later.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	vm_offset_t va = (vm_offset_t)ptr;
 	vm_paddr_t pa = (vm_paddr_t)vtophys(ptr);
 
 	dcache_wb_poc(va, pa, len);
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  * Not implemented here: *rate is never written and ENXIO is returned.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 
 	return (ENXIO);
 }
 
 /*
  * Idle loop body.
  *
  * Inside a spinlock section: unless `busy` is set (and event timers are
  * compiled in) the idle clock is switched on before and the active clock
  * after sleeping; the CPU only sleeps, via cpu_sleep(0), when
  * sched_runnable() reports nothing to run.  Entry and exit are traced
  * with KTR_SPARE2.
  */
 void
 cpu_idle(int busy)
 {
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu);
 	spinlock_enter();
 #ifndef NO_EVENTTIMERS
 	if (!busy)
 		cpu_idleclock();
 #endif
 	if (!sched_runnable())
 		cpu_sleep(0);
 #ifndef NO_EVENTTIMERS
 	if (!busy)
 		cpu_activeclock();
 #endif
 	spinlock_exit();
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu);
 }
 
 /* Does nothing for the given cpu and always returns 0. */
 int
 cpu_idle_wakeup(int cpu)
 {
 
 	return (0);
 }
 
 #ifdef NO_EVENTTIMERS
 /*
  * Most ARM platforms don't need to do anything special to init their clocks
  * (they get initialized during normal device attachment), and by not defining a
  * cpu_initclocks() function they get this generic one.  Any platform that needs
  * to do something special can just provide their own implementation, which will
  * override this one due to the weak linkage.
  */
 void
 arm_generic_initclocks(void)
 {
 }
 __weak_reference(arm_generic_initclocks, cpu_initclocks);
 
 #else
 /*
  * Initialise the clocks for the calling CPU.  On SMP kernels every CPU
  * other than cpuid 0 takes the application-processor path; cpuid 0, and
  * every non-SMP kernel, takes the boot-processor path.
  */
 void
 cpu_initclocks(void)
 {
 
 #ifdef SMP
 	if (PCPU_GET(cpuid) != 0) {
 		cpu_initclocks_ap();
 		return;
 	}
 #endif
 	cpu_initclocks_bsp();
 }
 #endif
 
 #ifdef PLATFORM
 /*
  * Install the platform DELAY() implementation and the opaque argument
  * that is handed back to it on every call.  impl must not be NULL.
  */
 void
 arm_set_delay(delay_func *impl, void *arg)
 {
 
 	KASSERT(impl != NULL, ("No DELAY implementation"));
 
 	delay_arg = arg;
 	delay_impl = impl;
 }
 
 /*
  * Delay for usec microseconds by forwarding to the implementation
  * installed with arm_set_delay().
  * NOTE(review): delay_impl is called unchecked, so this presumably must
  * not run before arm_set_delay() -- confirm against platform init order.
  */
 void
 DELAY(int usec)
 {
 
 	delay_impl(usec, delay_arg);
 }
 #endif
 
 /* Machine-dependent per-CPU data hook; intentionally empty here. */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 }
 
 /*
  * Enter a (possibly nested) spinlock section.  The outermost entry
  * disables IRQ and FIQ and stashes the previous CPSR in the thread;
  * nested entries only bump the nesting count.  Every entry also enters
  * a critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td = curthread;
 	register_t saved;
 
 	if (td->td_md.md_spinlock_count != 0) {
 		td->td_md.md_spinlock_count++;
 	} else {
 		saved = disable_interrupts(PSR_I | PSR_F);
 		td->td_md.md_spinlock_count = 1;
 		td->td_md.md_saved_cspr = saved;
 	}
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section: exit the critical section, drop the nesting
  * count and, when the outermost level is left, restore the interrupt
  * state saved by spinlock_enter().
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td = curthread;
 	register_t saved;
 
 	critical_exit();
 	/* Read the saved CPSR before the count is dropped. */
 	saved = td->td_md.md_saved_cspr;
 	if (--td->td_md.md_spinlock_count == 0)
 		restore_interrupts(saved);
 }
 
 /*
  * Reset the user register state on exec: zero the whole trapframe, then
  * set user mode, point both pc and the user lr at the image entry point,
  * load the initial user stack pointer, and fill the SVC lr with the
  * 0x77777777 marker.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	memset(frame, 0, sizeof(*frame));
 
 	frame->tf_spsr = PSR_USR32_MODE;
 	frame->tf_pc = imgp->entry_addr;
 	frame->tf_usr_lr = imgp->entry_addr;
 	frame->tf_usr_sp = stack;
 	frame->tf_svc_lr = 0x77777777;
 }
 
 
 #ifdef VFP
 /*
  * Copy the thread's VFP registers and FPSCR from its PCB into *vfp.
  * For the current thread the state is first stored into the PCB inside a
  * critical section; any other thread is asserted to be suspended.
  */
 void
 get_vfpcontext(struct thread *td, mcontext_vfp_t *vfp)
 {
 	struct pcb *pcb = td->td_pcb;
 
 	if (td != curthread) {
 		MPASS(TD_IS_SUSPENDED(td));
 	} else {
 		critical_enter();
 		vfp_store(&pcb->pcb_vfpstate, false);
 		critical_exit();
 	}
 
 	vfp->mcv_fpscr = pcb->pcb_vfpstate.fpscr;
 	memcpy(vfp->mcv_reg, pcb->pcb_vfpstate.reg,
 	    sizeof(vfp->mcv_reg));
 }
 
 /*
  * Load the VFP registers and FPSCR from *vfp into the thread's PCB.
  * For the current thread vfp_discard() is called first, inside a
  * critical section; any other thread is asserted to be suspended.
  */
 void
 set_vfpcontext(struct thread *td, mcontext_vfp_t *vfp)
 {
 	struct pcb *pcb = td->td_pcb;
 
 	if (td != curthread) {
 		MPASS(TD_IS_SUSPENDED(td));
 	} else {
 		critical_enter();
 		vfp_discard(td);
 		critical_exit();
 	}
 
 	pcb->pcb_vfpstate.fpscr = vfp->mcv_fpscr;
 	memcpy(pcb->pcb_vfpstate.reg, vfp->mcv_reg,
 	    sizeof(pcb->pcb_vfpstate.reg));
 }
 #endif
 
 int
 arm_get_vfpstate(struct thread *td, void *args)
 {
 	int rv;
 	struct arm_get_vfpstate_args ua;
 	mcontext_vfp_t	mcontext_vfp;
 
 	rv = copyin(args, &ua, sizeof(ua));
 	if (rv != 0)
 		return (rv);
 	if (ua.mc_vfp_size != sizeof(mcontext_vfp_t))
 		return (EINVAL);
 #ifdef VFP
 	get_vfpcontext(td, &mcontext_vfp);
 #else
 	bzero(&mcontext_vfp, sizeof(mcontext_vfp));
 #endif
 
 	rv = copyout(&mcontext_vfp, ua.mc_vfp,  sizeof(mcontext_vfp));
 	if (rv != 0)
 		return (rv);
 	return (0);
 }
 
 /*
  * Get machine context: copy the thread's trapframe registers into *mcp.
  *
  * When GET_MC_CLEAR_RET is set in clear_ret, r0 is reported as 0 and the
  * carry flag is cleared in the reported CPSR.  No VFP state is attached
  * (size 0, NULL pointer) and the spare area is zeroed.  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
 {
 	struct trapframe *frame = td->td_frame;
 	__greg_t *regs = mcp->__gregs;
 	const int clear = (clear_ret & GET_MC_CLEAR_RET) != 0;
 
 	regs[_REG_R0] = clear ? 0 : frame->tf_r0;
 	regs[_REG_CPSR] = clear ? (frame->tf_spsr & ~PSR_C) : frame->tf_spsr;
 
 	regs[_REG_R1] = frame->tf_r1;
 	regs[_REG_R2] = frame->tf_r2;
 	regs[_REG_R3] = frame->tf_r3;
 	regs[_REG_R4] = frame->tf_r4;
 	regs[_REG_R5] = frame->tf_r5;
 	regs[_REG_R6] = frame->tf_r6;
 	regs[_REG_R7] = frame->tf_r7;
 	regs[_REG_R8] = frame->tf_r8;
 	regs[_REG_R9] = frame->tf_r9;
 	regs[_REG_R10] = frame->tf_r10;
 	regs[_REG_R11] = frame->tf_r11;
 	regs[_REG_R12] = frame->tf_r12;
 	regs[_REG_SP] = frame->tf_usr_sp;
 	regs[_REG_LR] = frame->tf_usr_lr;
 	regs[_REG_PC] = frame->tf_pc;
 
 	mcp->mc_vfp_ptr = NULL;
 	mcp->mc_vfp_size = 0;
 	memset(&mcp->mc_spare, 0, sizeof(mcp->mc_spare));
 
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * Loads the thread's trapframe (r0-r12, sp, lr, pc, cpsr) from *mcp and,
  * when the context carries a VFP block of the expected size, the VFP
  * state as well (VFP kernels only).
  *
  * Returns EINVAL if the supplied CPSR does not select user mode or has
  * the IRQ/FIQ disable bits set, EFAULT if the VFP block cannot be copied
  * in, and 0 on success.  Nothing is modified when an error is returned.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	mcontext_vfp_t mc_vfp, *vfp;
 	struct trapframe *tf = td->td_frame;
 	const __greg_t *gr = mcp->__gregs;
+	int spsr;
 
+	/*
+	 * Make sure the processor mode has not been tampered with and
+	 * interrupts have not been disabled.
+	 */
+	spsr = gr[_REG_CPSR];
+	if ((spsr & PSR_MODE) != PSR_USR32_MODE ||
+	    (spsr & (PSR_I | PSR_F)) != 0)
+		return (EINVAL);
+
 #ifdef WITNESS
 	/* Diagnostics only: a malformed VFP description is logged, not rejected. */
 	if (mcp->mc_vfp_size != 0 && mcp->mc_vfp_size != sizeof(mc_vfp)) {
 		printf("%s: %s: Malformed mc_vfp_size: %d (0x%08X)\n",
 		    td->td_proc->p_comm, __func__,
 		    mcp->mc_vfp_size, mcp->mc_vfp_size);
 	} else if (mcp->mc_vfp_size != 0 && mcp->mc_vfp_ptr == NULL) {
 		printf("%s: %s: c_vfp_size != 0 but mc_vfp_ptr == NULL\n",
 		    td->td_proc->p_comm, __func__);
 	}
 #endif
 
 	/* Fetch the VFP block before touching the trapframe. */
 	if (mcp->mc_vfp_size == sizeof(mc_vfp) && mcp->mc_vfp_ptr != NULL) {
 		if (copyin(mcp->mc_vfp_ptr, &mc_vfp, sizeof(mc_vfp)) != 0)
 			return (EFAULT);
 		vfp = &mc_vfp;
 	} else {
 		vfp = NULL;
 	}
 
 	tf->tf_r0 = gr[_REG_R0];
 	tf->tf_r1 = gr[_REG_R1];
 	tf->tf_r2 = gr[_REG_R2];
 	tf->tf_r3 = gr[_REG_R3];
 	tf->tf_r4 = gr[_REG_R4];
 	tf->tf_r5 = gr[_REG_R5];
 	tf->tf_r6 = gr[_REG_R6];
 	tf->tf_r7 = gr[_REG_R7];
 	tf->tf_r8 = gr[_REG_R8];
 	tf->tf_r9 = gr[_REG_R9];
 	tf->tf_r10 = gr[_REG_R10];
 	tf->tf_r11 = gr[_REG_R11];
 	tf->tf_r12 = gr[_REG_R12];
 	tf->tf_usr_sp = gr[_REG_SP];
 	tf->tf_usr_lr = gr[_REG_LR];
 	tf->tf_pc = gr[_REG_PC];
 	tf->tf_spsr = gr[_REG_CPSR];
 #ifdef VFP
 	if (vfp != NULL)
 		set_vfpcontext(td, vfp);
 #endif
 	return (0);
 }
 
 /*
  * Deliver a signal to the current thread.
  *
  * Builds a struct sigframe (siginfo, ucontext and, on VFP kernels, the
  * VFP state), copies it onto the user stack -- or onto the alternate
  * signal stack when one is configured for this signal and not already in
  * use -- and rewrites the trapframe so the thread resumes in `catcher`
  * with r0 = signal number, r1 = &siginfo, r2 = r5 = &ucontext and lr
  * pointing at the signal trampoline.
  *
  * Called with the process lock and ps_mtx held; both are dropped around
  * the copyout and re-acquired before returning.  If the frame cannot be
  * written, the process is terminated via sigexit(td, SIGILL).
  */
 void
 sendsig(catcher, ksi, mask)
 	sig_t catcher;
 	ksiginfo_t *ksi;
 	sigset_t *mask;
 {
 	struct thread *td;
 	struct proc *p;
 	struct trapframe *tf;
 	struct sigframe *fp, frame;
 	struct sigacts *psp;
 	struct sysentvec *sysent;
 	int onstack;
 	int sig;
 	int code;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	onstack = sigonstack(tf->tf_usr_sp);
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	    catcher, sig);
 
 	/* Allocate and validate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !(onstack) &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		fp = (struct sigframe *)td->td_frame->tf_usr_sp;
 
 	/* make room on the stack */
 	fp--;
 
 	/* make the stack aligned */
 	fp = (struct sigframe *)STACKALIGN(fp);
 
 	/*
 	 * Populate the siginfo frame.  Start from an all-zero frame so that
 	 * fields and padding that are not explicitly set below do not carry
 	 * stale kernel stack contents out to user space.
 	 */
 	memset(&frame, 0, sizeof(frame));
 	get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
 #ifdef VFP
 	get_vfpcontext(td, &frame.sf_vfp);
 	frame.sf_uc.uc_mcontext.mc_vfp_size = sizeof(fp->sf_vfp);
 	frame.sf_uc.uc_mcontext.mc_vfp_ptr = &fp->sf_vfp;
 #else
 	frame.sf_uc.uc_mcontext.mc_vfp_size = 0;
 	frame.sf_uc.uc_mcontext.mc_vfp_ptr = NULL;
 #endif
 	frame.sf_si = ksi->ksi_info;
 	frame.sf_uc.uc_sigmask = *mask;
 	/*
 	 * Copy the whole stack descriptor first and only then override its
 	 * flags; doing it the other way round discards the computed flags.
 	 */
 	frame.sf_uc.uc_stack = td->td_sigstk;
 	frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(td->td_proc);
 
 	/* Copy the sigframe out to the user's stack. */
 	if (copyout(&frame, fp, sizeof(*fp)) != 0) {
 		/* Process has trashed its stack. Kill it. */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Build context to run handler in.  We invoke the handler
 	 * directly, only returning via the trampoline.  Note the
 	 * trampoline version numbers are coordinated with machine-
 	 * dependent code in libc.
 	 */
 
 	tf->tf_r0 = sig;
 	tf->tf_r1 = (register_t)&fp->sf_si;
 	tf->tf_r2 = (register_t)&fp->sf_uc;
 
 	/* the trampoline uses r5 as the uc address */
 	tf->tf_r5 = (register_t)&fp->sf_uc;
 	tf->tf_pc = (register_t)catcher;
 	tf->tf_usr_sp = (register_t)fp;
 	sysent = p->p_sysent;
 	if (sysent->sv_sigcode_base != 0)
 		tf->tf_usr_lr = (register_t)sysent->sv_sigcode_base;
 	else
 		tf->tf_usr_lr = (register_t)(sysent->sv_psstrings -
 		    *(sysent->sv_szsigcode));
 	/* Set the mode to enter in the signal handler */
 #if __ARM_ARCH >= 7
 	if ((register_t)catcher & 1)
 		tf->tf_spsr |= PSR_T;
 	else
 		tf->tf_spsr &= ~PSR_T;
 #endif
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_usr_lr,
 	    tf->tf_usr_sp);
 
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * sigreturn(2): resume the context saved by sendsig().
  *
  * Copies the ucontext in from user space, restores the register context
  * through set_mcontext() and then restores the signal mask.  Returns
  * EFAULT when the argument or the user context cannot be read, the error
  * from set_mcontext() when the context is rejected, and EJUSTRETURN on
  * success.
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
-	int spsr;
+	int error;
 
 	if (uap == NULL)
 		return (EFAULT);
 	if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
 		return (EFAULT);
-	/*
-	 * Make sure the processor mode has not been tampered with and
-	 * interrupts have not been disabled.
-	 */
-	spsr = uc.uc_mcontext.__gregs[_REG_CPSR];
-	if ((spsr & PSR_MODE) != PSR_USR32_MODE ||
-	    (spsr & (PSR_I | PSR_F)) != 0)
-		return (EINVAL);
 	/* Restore register context. */
-	set_mcontext(td, &uc.uc_mcontext);
+	error = set_mcontext(td, &uc.uc_mcontext);
+	if (error != 0)
+		return (error);
 
 	/* Restore signal mask. */
 	kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
 
 	return (EJUSTRETURN);
 }
 
 /*
  * Build a PCB from a trapframe so that a debugger backtrace, which works
  * from the PCB, can start at the function that trapped (this is used from
  * kdb_trap()).  Only r4-r12, pc, lr and sp are filled in; that is enough
  * for a backtrace, the PCB does not have to be complete.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 	/* Control registers. */
 	pcb->pcb_regs.sf_pc = tf->tf_pc;
 	pcb->pcb_regs.sf_lr = tf->tf_usr_lr;
 	pcb->pcb_regs.sf_sp = tf->tf_usr_sp;
 
 	/* General-purpose registers r4-r12. */
 	pcb->pcb_regs.sf_r4 = tf->tf_r4;
 	pcb->pcb_regs.sf_r5 = tf->tf_r5;
 	pcb->pcb_regs.sf_r6 = tf->tf_r6;
 	pcb->pcb_regs.sf_r7 = tf->tf_r7;
 	pcb->pcb_regs.sf_r8 = tf->tf_r8;
 	pcb->pcb_regs.sf_r9 = tf->tf_r9;
 	pcb->pcb_regs.sf_r10 = tf->tf_r10;
 	pcb->pcb_regs.sf_r11 = tf->tf_r11;
 	pcb->pcb_regs.sf_r12 = tf->tf_r12;
 }
 
 /*
  * Initialise the per-CPU data for CPU 0 (the structure pcpup points at)
  * and make thread0 its current thread.  On ARMv6 and later the current
  * thread is additionally set via set_curthread() before pcpu_init().
  */
 void
 pcpu0_init(void)
 {
 #if __ARM_ARCH >= 6
 	set_curthread(&thread0);
 #endif
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 }
 
 /*
  * Initialize proc0
  *
  * Links proc0 and thread0 together, records `kstack` as thread0's kernel
  * stack and places thread0's PCB in the last sizeof(struct pcb) bytes of
  * that stack.  The PCB's flags, VFP owner cpu and FPSCR are given their
  * initial values, thread0's trapframe is pointed at the static proc0_tf,
  * and the PCB becomes CPU 0's current PCB.
  */
 void
 init_proc0(vm_offset_t kstack)
 {
 	proc_linkup0(&proc0, &thread0);
 	thread0.td_kstack = kstack;
 	/* The cast binds first, so "- 1" steps back one whole struct pcb. */
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + kstack_pages * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_pcb->pcb_vfpcpu = -1;
 	thread0.td_pcb->pcb_vfpstate.fpscr = VFPSCR_DN;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 }
 
 /*
  * set_stackptrs: set the IRQ, abort and undefined-mode stack pointers for
  * the given cpu.  Each mode's stack area is carved into per-CPU slices of
  * <MODE>_STACK_SIZE pages; the pointer handed to set_stackptr() is the
  * upper end of slice `cpu` (base + slice size * (cpu + 1)).
  *
  * The two variants differ only in how the base address is stored: a plain
  * vm_offset_t on ARMv6 and later, a struct pv_addr (pv_va) before that.
  */
 #if __ARM_ARCH >= 6
 void
 set_stackptrs(int cpu)
 {
 
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
 	set_stackptr(PSR_UND32_MODE,
 	    undstack + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
 }
 #else
 void
 set_stackptrs(int cpu)
 {
 
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
 }
 #endif
 
 
 #ifdef FDT
 #if __ARM_ARCH < 6
 /*
  * Early machine-dependent bootstrap for FDT kernels built with
  * __ARM_ARCH < 6.
  *
  * In order, this routine: parses the boot parameters, locates and installs
  * the device tree blob, registers the physical and reserved memory regions,
  * carves the L1/L2 page tables, vector page, DPCPU area, per-mode stacks and
  * message buffer out of the memory following the kernel image, builds and
  * switches to the new page tables, brings up the console, and finishes
  * proc0/pmap/physmem initialisation.
  *
  * abp: boot parameters handed over by the early startup code.
  *
  * Returns kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb);
  * presumably the caller uses it as the initial kernel stack pointer
  * (NOTE(review): confirm against the assembly entry code).
  */
 void *
 initarm(struct arm_boot_params *abp)
 {
 	struct mem_region mem_regions[FDT_MEM_REGIONS];
 	struct pv_addr kernel_l1pt;
 	struct pv_addr dpcpu;
 	vm_offset_t dtbp, freemempos, l2_start, lastaddr;
 	uint64_t memsize;
 	uint32_t l2size;
 	char *env;
 	void *kmdp;
 	u_int l1pagetable;
 	int i, j, err_devmap, mem_regions_sz;
 
 	lastaddr = parse_boot_param(abp);
 	arm_physmem_kernaddr = abp->abp_physaddr;
 
 	memsize = 0;
 
 	cpuinfo_init();
 	set_cpufuncs();
 
 	/*
 	 * Find the dtb passed in by the boot loader.
 	 */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp != NULL)
 		dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t);
 	else
 		dtbp = (vm_offset_t)NULL;
 
 #if defined(FDT_DTB_STATIC)
 	/*
 	 * In case the device tree blob was not retrieved (from metadata) try
 	 * to use the statically embedded one.
 	 */
 	if (dtbp == (vm_offset_t)NULL)
 		dtbp = (vm_offset_t)&fdt_static_dtb;
 #endif
 
 	if (OF_install(OFW_FDT, 0) == FALSE)
 		panic("Cannot install FDT");
 
 	if (OF_init((void *)dtbp) != 0)
 		panic("OF_init failed with the found device tree");
 
 	/* Grab physical memory regions information from device tree. */
 	if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, &memsize) != 0)
 		panic("Cannot get physical memory regions");
 	arm_physmem_hardware_regions(mem_regions, mem_regions_sz);
 
 	/* Grab reserved memory regions information from device tree. */
 	if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0)
 		arm_physmem_exclude_regions(mem_regions, mem_regions_sz,
 		    EXFLAG_NODUMP | EXFLAG_NOALLOC);
 
 	/* Platform-specific initialisation */
 	platform_probe_and_attach();
 
 	pcpu0_init();
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 
 	/* Calculate number of L2 tables needed for mapping vm_page_array */
 	l2size = (memsize / PAGE_SIZE) * sizeof(struct vm_page);
 	l2size = (l2size >> L1_S_SHIFT) + 1;
 
 	/*
 	 * Add one table for end of kernel map, one for stacks, msgbuf and
 	 * L1 and L2 tables map,  one for vectors map and two for
 	 * l2 structures from pmap_bootstrap.
 	 */
 	l2size += 5;
 
 	/* Make it divisible by 4 */
 	l2size = (l2size + 3) & ~3;
 
 	/* First free page-aligned address after everything loaded so far. */
 	freemempos = (lastaddr + PAGE_MASK) & ~PAGE_MASK;
 
 	/*
 	 * Define a macro to simplify memory allocation.
 	 * Both macros are bump allocators on freemempos: alloc_pages() hands
 	 * out and zeroes 'np' pages, valloc_pages() additionally records the
 	 * matching physical address in a struct pv_addr.  Note that they
 	 * expand to several statements and are not wrapped in do { } while (0).
 	 */
 #define valloc_pages(var, np)						\
 	alloc_pages((var).pv_va, (np));					\
 	(var).pv_pa = (var).pv_va + (abp->abp_physaddr - KERNVIRTADDR);
 
 #define alloc_pages(var, np)						\
 	(var) = freemempos;						\
 	freemempos += (np * PAGE_SIZE);					\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/* Advance page by page until the L1 table can be L1_TABLE_SIZE aligned. */
 	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos += PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 
 	/*
 	 * Several L2 tables share one page: allocate a fresh page for the
 	 * first table of each group (remembered in j) and derive the
 	 * addresses of the remaining tables of the group from it.
 	 */
 	for (i = 0, j = 0; i < l2size; ++i) {
 		if (!(i % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[i],
 			    L2_TABLE_SIZE / PAGE_SIZE);
 			j = i;
 		} else {
 			kernel_pt_table[i].pv_va = kernel_pt_table[j].pv_va +
 			    L2_TABLE_SIZE_REAL * (i - j);
 			kernel_pt_table[i].pv_pa =
 			    kernel_pt_table[i].pv_va - KERNVIRTADDR +
 			    abp->abp_physaddr;
 
 		}
 	}
 	/*
 	 * Allocate a page for the system page mapped to 0x00000000
 	 * or 0xffff0000. This page will just contain the system vectors
 	 * and can be shared by all processes.
 	 */
 	valloc_pages(systempage, 1);
 
 	/* Allocate dynamic per-cpu area. */
 	valloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
 	dpcpu_init((void *)dpcpu.pv_va, 0);
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE * MAXCPU);
 	valloc_pages(abtstack, ABT_STACK_SIZE * MAXCPU);
 	valloc_pages(undstack, UND_STACK_SIZE * MAXCPU);
 	valloc_pages(kernelstack, kstack_pages * MAXCPU);
 	valloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
 
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_va;
 
 	/*
 	 * Try to map as much as possible of kernel text and data using
 	 * 1MB section mapping and for the rest of initial kernel address
 	 * space use L2 coarse tables.
 	 *
 	 * Link L2 tables for mapping remainder of kernel (modulo 1MB)
 	 * and kernel structures
 	 */
 	l2_start = lastaddr & ~(L1_S_OFFSET);
 	/* The last L2 table is kept back for the vector page below. */
 	for (i = 0 ; i < l2size - 1; i++)
 		pmap_link_l2pt(l1pagetable, l2_start + i * L1_S_SIZE,
 		    &kernel_pt_table[i]);
 
 	pmap_curmaxkvaddr = l2_start + (l2size - 1) * L1_S_SIZE;
 
 	/* Map kernel code and data */
 	pmap_map_chunk(l1pagetable, KERNVIRTADDR, abp->abp_physaddr,
 	   (((uint32_t)(lastaddr) - KERNVIRTADDR) + PAGE_MASK) & ~PAGE_MASK,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 	/* Map L1 directory and allocated L2 page tables */
 	pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa,
 	    L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 
 	pmap_map_chunk(l1pagetable, kernel_pt_table[0].pv_va,
 	    kernel_pt_table[0].pv_pa,
 	    L2_TABLE_SIZE_REAL * l2size,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 
 	/*
 	 * Map allocated DPCPU, stacks and msgbuf.
 	 * Everything allocated from dpcpu up to the current freemempos is
 	 * covered by this single mapping.
 	 */
 	pmap_map_chunk(l1pagetable, dpcpu.pv_va, dpcpu.pv_pa,
 	    freemempos - dpcpu.pv_va,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 	/* Link and map the vector page */
 	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH,
 	    &kernel_pt_table[l2size - 1]);
 	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
 	    VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, PTE_CACHE);
 
 	/*
 	 * Establish static device mappings.
 	 * The devmap error is only reported once the console is up (below).
 	 */
 	err_devmap = platform_devmap_init();
 	devmap_bootstrap(l1pagetable, NULL);
 	vm_max_kernel_address = platform_lastaddr();
 
 	/* Switch to the freshly built L1 table. */
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) | DOMAIN_CLIENT);
 	pmap_pa = kernel_l1pt.pv_pa;
 	cpu_setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2));
 
 	/*
 	 * Now that proper page tables are installed, call cpu_setup() to enable
 	 * instruction and data caches and other chip-specific features.
 	 */
 	cpu_setup();
 
 	/*
 	 * Only after the SOC registers block is mapped we can perform device
 	 * tree fixups, as they may attempt to read parameters from hardware.
 	 */
 	OF_interpret("perform-fixup", 0);
 
 	platform_gpio_init();
 
 	cninit();
 
 	debugf("initarm: console initialized\n");
 	debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp);
 	debugf(" boothowto = 0x%08x\n", boothowto);
 	debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp);
 	arm_print_kenv();
 
 	/* Copy the kernel name out of the environment, then release the string. */
 	env = kern_getenv("kernelname");
 	if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	if (err_devmap != 0)
 		printf("WARNING: could not fully configure devmap, error=%d\n",
 		    err_devmap);
 
 	platform_late_init();
 
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 	cpu_control(CPU_CONTROL_MMU_ENABLE, CPU_CONTROL_MMU_ENABLE);
 
 	set_stackptrs(0);
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in cpu_setttb()
 	 * but since we are boot strapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross relocations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 
 	undefined_init();
 
 	init_proc0(kernelstack.pv_va);
 
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 	pmap_bootstrap(freemempos, &kernel_l1pt);
 	msgbufp = (void *)msgbufpv.pv_va;
 	msgbufinit(msgbufp, msgbufsize);
 	mutex_init();
 
 	/*
 	 * Exclude the kernel (and all the things we allocated which immediately
 	 * follow the kernel) from the VM allocation pool but not from crash
 	 * dumps.  virtual_avail is a global variable which tracks the kva we've
 	 * "allocated" while setting up pmaps.
 	 *
 	 * Prepare the list of physical memory available to the vm subsystem.
 	 */
 	arm_physmem_exclude_region(abp->abp_physaddr,
 	    (virtual_avail - KERNVIRTADDR), EXFLAG_NOALLOC);
 	arm_physmem_init_kernel_globals();
 
 	init_param2(physmem);
 	dbg_monitor_init();
 	kdb_init();
 
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
 #else /* __ARM_ARCH < 6 */
 /*
  * Early machine-dependent bootstrap for FDT kernels built with
  * __ARM_ARCH >= 6.
  *
  * In order, this routine: parses the boot parameters, locates and installs
  * the device tree blob, registers the physical memory regions (from the EFI
  * map when one was passed in, otherwise from the device tree) and the
  * reserved regions, prepares and switches to the kernel page tables, obtains
  * the vector page, DPCPU area, per-mode stacks and message buffer from the
  * pmap preboot allocator, brings up the console, and finishes
  * proc0/pmap/physmem initialisation.
  *
  * abp: boot parameters handed over by the early startup code.
  *
  * Returns STACKALIGN(thread0.td_pcb); presumably the caller uses it as the
  * initial kernel stack pointer (NOTE(review): confirm against the assembly
  * entry code).
  */
 void *
 initarm(struct arm_boot_params *abp)
 {
 	struct mem_region mem_regions[FDT_MEM_REGIONS];
 	vm_paddr_t lastaddr;
 	vm_offset_t dtbp, kernelstack, dpcpu;
 	char *env;
 	void *kmdp;
 	int err_devmap, mem_regions_sz;
 #ifdef EFI
 	struct efi_map_header *efihdr;
 #endif
 
 	/* get last allocated physical address */
 	arm_physmem_kernaddr = abp->abp_physaddr;
 	lastaddr = parse_boot_param(abp) - KERNVIRTADDR + arm_physmem_kernaddr;
 
 	set_cpufuncs();
 	cpuinfo_init();
 
 	/*
 	 * Find the dtb passed in by the boot loader.
 	 */
 	kmdp = preload_search_by_type("elf kernel");
 	dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t);
 #if defined(FDT_DTB_STATIC)
 	/*
 	 * In case the device tree blob was not retrieved (from metadata) try
 	 * to use the statically embedded one.
 	 */
 	if (dtbp == (vm_offset_t)NULL)
 		dtbp = (vm_offset_t)&fdt_static_dtb;
 #endif
 
 	if (OF_install(OFW_FDT, 0) == FALSE)
 		panic("Cannot install FDT");
 
 	if (OF_init((void *)dtbp) != 0)
 		panic("OF_init failed with the found device tree");
 
 #if defined(LINUX_BOOT_ABI)
 	arm_parse_fdt_bootargs();
 #endif
 
 #ifdef EFI
 	/* Prefer the EFI memory map when the loader supplied one. */
 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	if (efihdr != NULL) {
 		arm_add_efi_map_entries(efihdr, mem_regions, &mem_regions_sz);
 	} else
 #endif
 	{
 		/* Grab physical memory regions information from device tree. */
 		if (fdt_get_mem_regions(mem_regions, &mem_regions_sz,NULL) != 0)
 			panic("Cannot get physical memory regions");
 	}
 	arm_physmem_hardware_regions(mem_regions, mem_regions_sz);
 
 	/* Grab reserved memory regions information from device tree. */
 	if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0)
 		arm_physmem_exclude_regions(mem_regions, mem_regions_sz,
 		    EXFLAG_NODUMP | EXFLAG_NOALLOC);
 
 	/*
 	 * Set TEX remapping registers.
 	 * Setup kernel page tables and switch to kernel L1 page table.
 	 */
 	pmap_set_tex();
 	pmap_bootstrap_prepare(lastaddr);
 
 	/*
 	 * Now that proper page tables are installed, call cpu_setup() to enable
 	 * instruction and data caches and other chip-specific features.
 	 */
 	cpu_setup();
 
 	/* Platform-specific initialisation */
 	platform_probe_and_attach();
 	pcpu0_init();
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 
 	/*
 	 * Allocate a page for the system page mapped to 0xffff0000
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	systempage = pmap_preboot_get_pages(1);
 
 	/* Map the vector page. */
 	pmap_preboot_map_pages(systempage, ARM_VECTORS_HIGH,  1);
 	/* Keep the usable kernel VA range below the vector page. */
 	if (virtual_end >= ARM_VECTORS_HIGH)
 		virtual_end = ARM_VECTORS_HIGH - 1;
 
 	/* Allocate dynamic per-cpu area. */
 	dpcpu = pmap_preboot_get_vpages(DPCPU_SIZE / PAGE_SIZE);
 	dpcpu_init((void *)dpcpu, 0);
 
 	/* Allocate stacks for all modes */
 	irqstack    = pmap_preboot_get_vpages(IRQ_STACK_SIZE * MAXCPU);
 	abtstack    = pmap_preboot_get_vpages(ABT_STACK_SIZE * MAXCPU);
 	undstack    = pmap_preboot_get_vpages(UND_STACK_SIZE * MAXCPU );
 	kernelstack = pmap_preboot_get_vpages(kstack_pages * MAXCPU);
 
 	/* Allocate message buffer. */
 	msgbufp = (void *)pmap_preboot_get_vpages(
 	    round_page(msgbufsize) / PAGE_SIZE);
 
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 	set_stackptrs(0);
 	mutex_init();
 
 	/*
 	 * Establish static device mappings.
 	 * The devmap error is only reported once the console is up (below).
 	 */
 	err_devmap = platform_devmap_init();
 	devmap_bootstrap(0, NULL);
 	vm_max_kernel_address = platform_lastaddr();
 
 	/*
 	 * Only after the SOC registers block is mapped we can perform device
 	 * tree fixups, as they may attempt to read parameters from hardware.
 	 */
 	OF_interpret("perform-fixup", 0);
 	platform_gpio_init();
 	cninit();
 
 	debugf("initarm: console initialized\n");
 	debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp);
 	debugf(" boothowto = 0x%08x\n", boothowto);
 	debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp);
 	debugf(" lastaddr1: 0x%08x\n", lastaddr);
 	arm_print_kenv();
 
 	/*
 	 * Copy the kernel name out of the environment.  The string handed
 	 * back by kern_getenv() must be released with freeenv() once it has
 	 * been copied, as the __ARM_ARCH < 6 variant of this function does.
 	 */
 	env = kern_getenv("kernelname");
 	if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	if (err_devmap != 0)
 		printf("WARNING: could not fully configure devmap, error=%d\n",
 		    err_devmap);
 
 	platform_late_init();
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in cpu_setttb()
 	 * but since we are boot strapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross relocations of the kernel thus
 	 * this problem will not occur after initarm().
 	 *
 	 * NOTE(review): unlike the __ARM_ARCH < 6 variant, no cache
 	 * maintenance call follows this comment here; confirm whether the
 	 * comment is stale or a call is missing.
 	 */
 	/* Set stack for exception handlers */
 	undefined_init();
 	init_proc0(kernelstack);
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 	enable_interrupts(PSR_A);
 	pmap_bootstrap(0);
 
 	/*
 	 * Exclude the kernel (and all the things we allocated which immediately
 	 * follow the kernel) from the VM allocation pool but not from crash
 	 * dumps.  virtual_avail is a global variable which tracks the kva we've
 	 * "allocated" while setting up pmaps.
 	 *
 	 * Prepare the list of physical memory available to the vm subsystem.
 	 */
 	arm_physmem_exclude_region(abp->abp_physaddr,
 		pmap_preboot_get_pages(0) - abp->abp_physaddr, EXFLAG_NOALLOC);
 	arm_physmem_init_kernel_globals();
 
 	init_param2(physmem);
 	/* Init message buffer. */
 	msgbufinit(msgbufp, msgbufsize);
 	dbg_monitor_init();
 	kdb_init();
 	return ((void *)STACKALIGN(thread0.td_pcb));
 }
 
 #endif /* __ARM_ARCH < 6 */
 #endif /* FDT */
Index: projects/bsd_rdma_4_9/sys/arm/cloudabi32/cloudabi32_sysvec.c
===================================================================
--- projects/bsd_rdma_4_9/sys/arm/cloudabi32/cloudabi32_sysvec.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/arm/cloudabi32/cloudabi32_sysvec.c	(revision 326162)
@@ -1,197 +1,197 @@
 /*-
  * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/frame.h>
 #include <machine/pcb.h>
 #include <machine/vmparam.h>
 
 #include <compat/cloudabi/cloudabi_util.h>
 
 #include <compat/cloudabi32/cloudabi32_syscall.h>
 #include <compat/cloudabi32/cloudabi32_util.h>
 
 extern const char *cloudabi32_syscallnames[];
 extern struct sysent cloudabi32_sysent[];
 
 /*
  * Initial register setup for a freshly exec'ed CloudABI process.
  *
  * Performs the generic exec_setregs() initialisation, then points r0 at the
  * auxiliary vector (located just past the register_t-aligned TCB at 'stack')
  * and installs 'stack' as the user TLS pointer.  The result of
  * cpu_set_user_tls() is deliberately ignored.
  */
 static void
 cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp,
     unsigned long stack)
 {
 	struct trapframe *regs;
 
 	exec_setregs(td, imgp, stack);
 
 	/*
 	 * The stack now contains a pointer to the TCB and the auxiliary
 	 * vector. Let r0 point to the auxiliary vector, and set
 	 * tpidrurw to the TCB.
 	 */
 	regs = td->td_frame;
-	regs->tf_r0 = td->td_retval[0] =
+	regs->tf_r0 =
 	    stack + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t));
 	(void)cpu_set_user_tls(td, (void *)stack);
 }
 
 /*
  * Decode the pending system call of 'td' into td->td_sa.
  *
  * The system call number is taken from r12, the first four arguments from
  * r0-r3 and any further arguments from the user stack.  Returns 0 on
  * success, ENOSYS for an out-of-range system call number, or the copyin()
  * error if the stack arguments cannot be read.
  */
 static int
 cloudabi32_fetch_syscall_args(struct thread *td)
 {
 	struct trapframe *tf = td->td_frame;
 	struct syscall_args *call = &td->td_sa;
 	int rv;
 
 	/* Look up the system call entry; reject unknown numbers. */
 	call->code = tf->tf_r12;
 	if (call->code >= CLOUDABI32_SYS_MAXSYSCALL)
 		return (ENOSYS);
 	call->callp = &cloudabi32_sysent[call->code];
 	call->narg = call->callp->sy_narg;
 
 	/* Register-passed arguments. */
 	call->args[0] = tf->tf_r0;
 	call->args[1] = tf->tf_r1;
 	call->args[2] = tf->tf_r2;
 	call->args[3] = tf->tf_r3;
 
 	/* Stack-passed arguments, if the call takes more than four. */
 	if (call->narg > 4) {
 		rv = copyin((void *)tf->tf_usr_sp, &call->args[4],
 		    (call->narg - 4) * sizeof(register_t));
 		if (rv != 0)
 			return (rv);
 	}
 
 	/* Return values used unless the system call overrides them. */
 	td->td_retval[0] = 0;
 	td->td_retval[1] = tf->tf_r1;
 	return (0);
 }
 
 static void
 cloudabi32_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame = td->td_frame;
 
 	switch (error) {
 	case 0:
 		/* System call succeeded. */
 		frame->tf_r0 = td->td_retval[0];
 		frame->tf_r1 = td->td_retval[1];
 		frame->tf_spsr &= ~PSR_C;
 		break;
 	case ERESTART:
 		/* Restart system call. */
 		frame->tf_pc -= 4;
 		break;
 	case EJUSTRETURN:
 		break;
 	default:
 		/* System call returned an error. */
 		frame->tf_r0 = cloudabi_convert_errno(error);
 		frame->tf_spsr |= PSR_C;
 		break;
 	}
 }
 
 /*
  * Set the initial register values seen by a child returning from fork:
  * r0 = CLOUDABI_PROCESS_CHILD and r1 = the new thread's id.
  */
 static void
 cloudabi32_schedtail(struct thread *td)
 {
 	struct trapframe *tf;
 
 	/*
 	 * Only processes coming out of fork get these values; a thread
 	 * created inside an existing process is left alone.
 	 */
 	if ((td->td_pflags & TDP_FORKING) == 0)
 		return;
 
 	tf = td->td_frame;
 	tf->tf_r0 = CLOUDABI_PROCESS_CHILD;
 	tf->tf_r1 = td->td_tid;
 }
 
 /*
  * Prepare the registers of a newly created CloudABI thread.
  *
  * td:   the new thread.
  * attr: thread attributes supplying the stack, entry point and argument.
  * tcb:  user address installed as the thread's TLS pointer.
  *
  * Returns the result of cpu_set_user_tls().
  */
 int
 cloudabi32_thread_setregs(struct thread *td,
     const cloudabi32_threadattr_t *attr, uint32_t tcb)
 {
 	struct trapframe *tf;
 	stack_t ustack;
 
 	/* Standard register initialisation on the caller-supplied stack. */
 	ustack.ss_sp = TO_PTR(attr->stack);
 	ustack.ss_size = attr->stack_len;
 	cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &ustack);
 
 	/*
 	 * The entry point receives the new thread's id in r0 and the
 	 * argument pointer chosen by the parent thread in r1.
 	 */
 	tf = td->td_frame;
 	tf->tf_r0 = td->td_tid;
 	tf->tf_r1 = attr->argument;
 
 	/* Finally install the TLS pointer. */
 	return (cpu_set_user_tls(td, (void *)tcb));
 }
 
 /*
  * System entry vector for the "CloudABI ELF32" ABI.  It wires the generic
  * cloudabi32 system call table and helpers together with the ARM-specific
  * register handling routines defined in this file.
  */
 static struct sysentvec cloudabi32_elf_sysvec = {
 	.sv_size		= CLOUDABI32_SYS_MAXSYSCALL,
 	.sv_table		= cloudabi32_sysent,
 	.sv_fixup		= cloudabi32_fixup,
 	.sv_name		= "CloudABI ELF32",
 	.sv_coredump		= elf32_coredump,
 	.sv_pagesize		= PAGE_SIZE,
 	.sv_minuser		= VM_MIN_ADDRESS,
 	.sv_maxuser		= VM_MAXUSER_ADDRESS,
 	.sv_stackprot		= VM_PROT_READ | VM_PROT_WRITE,
 	.sv_copyout_strings	= cloudabi32_copyout_strings,
 	.sv_setregs		= cloudabi32_proc_setregs,
 	.sv_flags		= SV_ABI_CLOUDABI | SV_CAPSICUM | SV_ILP32,
 	.sv_set_syscall_retval	= cloudabi32_set_syscall_retval,
 	.sv_fetch_syscall_args	= cloudabi32_fetch_syscall_args,
 	.sv_syscallnames	= cloudabi32_syscallnames,
 	.sv_schedtail		= cloudabi32_schedtail,
 };
 
 /* Register the vector above for initialisation via INIT_SYSENTVEC. */
 INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec);
 
 /*
  * ELF brand description tying ELFOSABI_CLOUDABI / EM_ARM executables to
  * cloudabi32_elf_sysvec.  BI_BRAND_ONLY_STATIC is set in the brand flags.
  */
 Elf32_Brandinfo cloudabi32_brand = {
 	.brand		= ELFOSABI_CLOUDABI,
 	.machine	= EM_ARM,
 	.sysvec		= &cloudabi32_elf_sysvec,
 	.flags		= BI_BRAND_ONLY_STATIC,
 };
Index: projects/bsd_rdma_4_9/sys/arm64/arm64/machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/arm64/arm64/machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/arm64/arm64/machdep.c	(revision 326162)
@@ -1,1180 +1,1179 @@
 /*-
  * Copyright (c) 2014 Andrew Turner
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include "opt_acpi.h"
 #include "opt_platform.h"
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/devmap.h>
 #include <sys/efi.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h> 
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/msgbuf.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vdso.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pager.h>
 
 #include <machine/armreg.h>
 #include <machine/cpu.h>
 #include <machine/debug_monitor.h>
 #include <machine/kdb.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/reg.h>
 #include <machine/undefined.h>
 #include <machine/vmparam.h>
 
 #ifdef VFP
 #include <machine/vfp.h>
 #endif
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <machine/acpica_machdep.h>
 #endif
 
 #ifdef FDT
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 #endif
 
 
 enum arm64_bus arm64_bus_method = ARM64_BUS_NONE;
 
 struct pcpu __pcpu[MAXCPU];
 
 static struct trapframe proc0_tf;
 
 vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2];
 vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2];
 
 int early_boot = 1;
 int cold = 1;
 long realmem = 0;
 long Maxmem = 0;
 
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 vm_paddr_t physmap[PHYSMAP_SIZE];
 u_int physmap_idx;
 
 /* Handed to vm_ksubmap_init() from cpu_startup(). */
 struct kva_md_info kmi;
 
 int64_t dcache_line_size;	/* The minimum D cache line size */
 int64_t icache_line_size;	/* The minimum I cache line size */
 int64_t idcache_line_size;	/* The minimum cache line size */
 int64_t dczva_line_size;	/* The size of cache line the dc zva zeroes */
 /*
  * Set to 1 by pan_setup() when ID_AA64MMFR1_EL1 reports PAN support;
  * tested by pan_enable().
  */
 int has_pan;
 
 /*
  * Physical address of the EFI System Table. Stashed from the metadata hints
  * passed into the kernel and used by the EFI code to call runtime services.
  */
 vm_paddr_t efi_systbl_phys;
 
 /* pagezero_* implementations are provided in support.S */
 void pagezero_simple(void *);
 void pagezero_cache(void *);
 
 /* pagezero_simple is default pagezero */
 void (*pagezero)(void *p) = pagezero_simple;
 
 static void
 pan_setup(void)
 {
 	uint64_t id_aa64mfr1;
 
 	id_aa64mfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
 	if (ID_AA64MMFR1_PAN(id_aa64mfr1) != ID_AA64MMFR1_PAN_NONE)
 		has_pan = 1;
 }
 
 /*
  * Turn on PAN for the current CPU when pan_setup() found it to be
  * available; otherwise do nothing.
  */
 void
 pan_enable(void)
 {
 
 	if (!has_pan)
 		return;
 
 	/*
 	 * Setting the PAN bit stops the kernel from accessing memory that
 	 * userspace can also access, unless the kernel uses the userspace
 	 * load/store instructions.
 	 *
 	 * The LLVM integrated assembler doesn't understand the PAN PSTATE
 	 * field, so the instruction is emitted by hand below.  It is
 	 * equivalent to:
 	 * msr pan, #1
 	 */
 	WRITE_SPECIALREG(sctlr_el1,
 	    READ_SPECIALREG(sctlr_el1) & ~SCTLR_SPAN);
 	__asm __volatile(".inst 0xd500409f | (0x1 << 8)");
 }
 
 /*
  * SYSINIT hook run at SI_SUB_CPU / SI_ORDER_FIRST: sets up undefined
  * instruction handling, identifies the CPU and initialises the kernel
  * submaps and buffer subsystems.  'dummy' is unused.
  */
 static void
 cpu_startup(void *dummy)
 {
 
 	undef_init();
 	identify_cpu();
 
 	vm_ksubmap_init(&kmi);
 	bufinit();
 	vm_pager_bufferinit();
 }
 
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /*
  * Stub: 'cpu' is ignored and 0 is always returned.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 
 	return (0);
 }
 
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	regs->sp = frame->tf_sp;
 	regs->lr = frame->tf_lr;
 	regs->elr = frame->tf_elr;
 	regs->spsr = frame->tf_spsr;
 
 	memcpy(regs->x, frame->tf_x, sizeof(regs->x));
 
 	return (0);
 }
 
 /*
  * Install the general-purpose register state in 'regs' into the trapframe
  * of 'td'.  sp, lr, elr and the x registers are copied verbatim.  In the
  * revised ('+') code only the PSR_FLAGS bits of spsr are taken from 'regs';
  * the remaining spsr bits of the trapframe are preserved.  Always returns 0.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	frame->tf_sp = regs->sp;
 	frame->tf_lr = regs->lr;
 	frame->tf_elr = regs->elr;
-	frame->tf_spsr = regs->spsr;
+	frame->tf_spsr &= ~PSR_FLAGS;
+	frame->tf_spsr |= regs->spsr & PSR_FLAGS;
 
 	memcpy(frame->tf_x, regs->x, sizeof(frame->tf_x));
 
 	return (0);
 }
 
 int
 fill_fpregs(struct thread *td, struct fpreg *regs)
 {
 #ifdef VFP
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 	if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
 		/*
 		 * If we have just been running VFP instructions we will
 		 * need to save the state to memcpy it below.
 		 */
 		if (td == curthread)
 			vfp_save_state(td, pcb);
 
 		KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate,
 		    ("Called fill_fpregs while the kernel is using the VFP"));
 		memcpy(regs->fp_q, pcb->pcb_fpustate.vfp_regs,
 		    sizeof(regs->fp_q));
 		regs->fp_cr = pcb->pcb_fpustate.vfp_fpcr;
 		regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr;
 	} else
 #endif
 		memset(regs->fp_q, 0, sizeof(regs->fp_q));
 	return (0);
 }
 
 /*
  * Install the floating-point register state in 'regs' into the PCB of
  * 'td'.  Does nothing when the kernel is built without VFP.  Always
  * returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *regs)
 {
 #ifdef VFP
 	struct pcb *tdpcb = td->td_pcb;
 
 	KASSERT(tdpcb->pcb_fpusaved == &tdpcb->pcb_fpustate,
 	    ("Called set_fpregs while the kernel is using the VFP"));
 
 	/* Vector registers first, then the control and status registers. */
 	memcpy(tdpcb->pcb_fpustate.vfp_regs, regs->fp_q, sizeof(regs->fp_q));
 	tdpcb->pcb_fpustate.vfp_fpcr = regs->fp_cr;
 	tdpcb->pcb_fpustate.vfp_fpsr = regs->fp_sr;
 #endif
 	return (0);
 }
 
 /*
  * Not implemented: prints a TODO marker and returns EDOOFUS without
  * touching 'td' or 'regs'.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *regs)
 {
 
 	printf("ARM64TODO: fill_dbregs");
 	return (EDOOFUS);
 }
 
 /*
  * Not implemented: prints a TODO marker and returns EDOOFUS without
  * touching 'td' or 'regs'.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *regs)
 {
 
 	printf("ARM64TODO: set_dbregs");
 	return (EDOOFUS);
 }
 
 /*
  * Not implemented: prints a TODO marker and returns EDOOFUS; the thread's
  * program counter is left unchanged.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 
 	printf("ARM64TODO: ptrace_set_pc");
 	return (EDOOFUS);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 
 	td->td_frame->tf_spsr |= PSR_SS;
 	td->td_pcb->pcb_flags |= PCB_SINGLE_STEP;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	td->td_frame->tf_spsr &= ~PSR_SS;
 	td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP;
 	return (0);
 }
 
 /*
  * Reset the trapframe of 'td' for a newly exec'ed image: the frame is
  * zeroed, x0 receives 'stack', sp the STACKALIGN()ed stack address, and
  * both lr and elr the image entry point.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *tf = td->td_frame;
 
 	memset(tf, 0, sizeof(struct trapframe));
 
-	/*
-	 * We need to set x0 for init as it doesn't call
-	 * cpu_set_syscall_retval to copy the value. We also
-	 * need to set td_retval for the cases where we do.
-	 */
-	tf->tf_x[0] = td->td_retval[0] = stack;
+	tf->tf_x[0] = stack;
 	tf->tf_sp = STACKALIGN(stack);
 	tf->tf_lr = imgp->entry_addr;
 	tf->tf_elr = imgp->entry_addr;
 }
 
 /* Sanity check these are the same size, they will be memcpy'd to and fro */
 CTASSERT(sizeof(((struct trapframe *)0)->tf_x) ==
     sizeof((struct gpregs *)0)->gp_x);
 CTASSERT(sizeof(((struct trapframe *)0)->tf_x) ==
     sizeof((struct reg *)0)->x);
 
 /*
  * Capture the general-purpose register state of 'td' into 'mcp'.
  *
  * With GET_MC_CLEAR_RET set in 'clear_ret' the saved context reports x0 as
  * 0 and has PSR_C cleared in spsr; otherwise both are copied unchanged.
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
 {
 	struct trapframe *frame = td->td_frame;
 
 	/* Special registers. */
 	mcp->mc_gpregs.gp_sp = frame->tf_sp;
 	mcp->mc_gpregs.gp_lr = frame->tf_lr;
 	mcp->mc_gpregs.gp_elr = frame->tf_elr;
 
 	/* x0 and spsr depend on whether the return value is to be cleared. */
 	if ((clear_ret & GET_MC_CLEAR_RET) == 0) {
 		mcp->mc_gpregs.gp_x[0] = frame->tf_x[0];
 		mcp->mc_gpregs.gp_spsr = frame->tf_spsr;
 	} else {
 		mcp->mc_gpregs.gp_x[0] = 0;
 		mcp->mc_gpregs.gp_spsr = frame->tf_spsr & ~PSR_C;
 	}
 
 	/* x1 and up are always copied verbatim. */
 	memcpy(&mcp->mc_gpregs.gp_x[1], &frame->tf_x[1],
 	    sizeof(mcp->mc_gpregs.gp_x[1]) * (nitems(mcp->mc_gpregs.gp_x) - 1));
 
 	return (0);
 }
 
 /*
  * Install the general-purpose register state in 'mcp' into the trapframe
  * of 'td'.
  *
  * In the revised ('+') code the supplied spsr is validated first: the mode
  * field must be PSR_M_EL0t and none of PSR_F, PSR_I, PSR_A, PSR_D may be
  * set, otherwise EINVAL is returned and the trapframe is left untouched.
  * Returns 0 on success.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct trapframe *tf = td->td_frame;
+	uint32_t spsr;
 
+	spsr = mcp->mc_gpregs.gp_spsr;
+	if ((spsr & PSR_M_MASK) != PSR_M_EL0t ||
+	    (spsr & (PSR_F | PSR_I | PSR_A | PSR_D)) != 0)
+		return (EINVAL); 
+
 	memcpy(tf->tf_x, mcp->mc_gpregs.gp_x, sizeof(tf->tf_x));
 
 	tf->tf_sp = mcp->mc_gpregs.gp_sp;
 	tf->tf_lr = mcp->mc_gpregs.gp_lr;
 	tf->tf_elr = mcp->mc_gpregs.gp_elr;
 	tf->tf_spsr = mcp->mc_gpregs.gp_spsr;
 
 	return (0);
 }
 
 /*
  * Capture the current thread's floating-point state into 'mcp'.
  *
  * Nothing is stored (and _MC_FP_VALID is not set) unless the thread has
  * started using the FPU.  The whole operation runs inside a critical
  * section.  Without VFP support this is a no-op.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 #ifdef VFP
 	struct pcb *curpcb;
 
 	critical_enter();
 
 	curpcb = curthread->td_pcb;
 
 	if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
 		/*
 		 * If we have just been running VFP instructions we will
 		 * need to save the state to memcpy it below.
 		 */
 		vfp_save_state(td, curpcb);
 
 		KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate,
 		    ("Called get_fpcontext while the kernel is using the VFP"));
 		KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0,
 		    ("Non-userspace FPU flags set in get_fpcontext"));
 		/*
 		 * Copy only the vector registers.  The length must be that
 		 * of fp_q, not of the whole mc_fpregs structure (which also
 		 * holds fp_cr, fp_sr and fp_flags, filled in below), or the
 		 * copy would read beyond vfp_regs.
 		 */
 		memcpy(mcp->mc_fpregs.fp_q, curpcb->pcb_fpustate.vfp_regs,
 		    sizeof(mcp->mc_fpregs.fp_q));
 		mcp->mc_fpregs.fp_cr = curpcb->pcb_fpustate.vfp_fpcr;
 		mcp->mc_fpregs.fp_sr = curpcb->pcb_fpustate.vfp_fpsr;
 		mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags;
 		mcp->mc_flags |= _MC_FP_VALID;
 	}
 
 	critical_exit();
 #endif
 }
 
 /*
  * Install the floating-point state carried in 'mcp' into the current
  * thread's PCB.
  *
  * Nothing is done unless 'mcp' has _MC_FP_VALID set.  Only the flag bits
  * in PCB_FP_USERMASK are accepted from the supplied context.  The whole
  * operation runs inside a critical section.  Without VFP support this is a
  * no-op.
  */
 static void
 set_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 #ifdef VFP
 	struct pcb *curpcb;
 
 	critical_enter();
 
 	if ((mcp->mc_flags & _MC_FP_VALID) != 0) {
 		curpcb = curthread->td_pcb;
 
 		/*
 		 * Discard any vfp state for the current thread, we
 		 * are about to override it.
 		 */
 		vfp_discard(td);
 
 		KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate,
 		    ("Called set_fpcontext while the kernel is using the VFP"));
 		/*
 		 * Copy only the vector registers.  The length must be that
 		 * of fp_q, not of the whole mc_fpregs structure, or the copy
 		 * would write beyond vfp_regs; the control and status
 		 * registers are assigned individually below.
 		 */
 		memcpy(curpcb->pcb_fpustate.vfp_regs, mcp->mc_fpregs.fp_q,
 		    sizeof(mcp->mc_fpregs.fp_q));
 		curpcb->pcb_fpustate.vfp_fpcr = mcp->mc_fpregs.fp_cr;
 		curpcb->pcb_fpustate.vfp_fpsr = mcp->mc_fpregs.fp_sr;
 		curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK;
 	}
 
 	critical_exit();
 #endif
 }
 
 /*
  * Idle the current CPU.
  *
  * Runs between spinlock_enter()/spinlock_exit().  When 'busy' is zero the
  * idle clock is switched on around the wait.  The CPU only executes
  * "dsb sy; wfi" if sched_runnable() reports nothing to run.
  */
 void
 cpu_idle(int busy)
 {
 
 	spinlock_enter();
 	if (!busy)
 		cpu_idleclock();
 	if (!sched_runnable())
 		__asm __volatile(
 		    "dsb sy \n"
 		    "wfi    \n");
 	if (!busy)
 		cpu_activeclock();
 	spinlock_exit();
 }
 
 /*
  * Park the CPU for good: disable interrupts and sit in a wait-for-interrupt
  * loop.  Never returns.
  */
 void
 cpu_halt(void)
 {
 
 	/* We should have shutdown by now, if not enter a low power sleep */
 	intr_disable();
 	for (;;)
 		__asm __volatile("wfi");
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * Currently an empty stub: 'ptr' and 'len' are ignored and no cache
  * maintenance is performed.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 
 	/* ARM64TODO TBD */
 }
 
 /*
  * Report the clock frequency recorded for CPU 'cpu_id' through '*rate'.
  *
  * Returns 0 on success, EINVAL when the CPU does not exist or 'rate' is
  * NULL, and EOPNOTSUPP when no clock rate has been recorded (pc_clock is 0).
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	struct pcpu *cpu;
 
 	cpu = pcpu_find(cpu_id);
 	if (cpu == NULL)
 		return (EINVAL);
 	if (rate == NULL)
 		return (EINVAL);
 	if (cpu->pc_clock == 0)
 		return (EOPNOTSUPP);
 
 	*rate = cpu->pc_clock;
 	return (0);
 }
 
 /*
  * Machine-dependent per-CPU data initialisation: sets pc_acpi_id to
  * 0xffffffff (presumably meaning "no ACPI id assigned yet" -- TODO
  * confirm).  'cpuid' and 'size' are unused.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Enter a spinlock section for the current thread.
  *
  * On the outermost entry (count == 0) interrupts are disabled and the
  * previous DAIF state is saved in the thread's MD area; nested entries only
  * bump the count.  A critical section is entered on every call.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 	register_t daif;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0) {
 		/* Disable interrupts before publishing the new count. */
 		daif = intr_disable();
 		td->td_md.md_spinlock_count = 1;
 		td->td_md.md_saved_daif = daif;
 	} else
 		td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section.
  *
  * Exits the critical section and drops the per-thread nesting count; when
  * the count reaches zero the DAIF state saved by the outermost
  * spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	critical_exit();
 	/* Fetch the saved state before touching the count. */
 	saved = self->td_md.md_saved_daif;
 	if (--self->td_md.md_spinlock_count == 0)
 		intr_restore(saved);
 }
 
 #ifndef	_SYS_SYSPROTO_H_
 /*
  * Fallback declaration of the sigreturn() argument structure, used only
  * when the generated system call prototypes are not in scope.  The member
  * name matches the one sys_sigreturn() dereferences (uap->sigcntxp).
  */
 struct sigreturn_args {
 	ucontext_t *sigcntxp;
 };
 #endif
 
 /*
  * sigreturn() system call: resume the context saved in a user ucontext.
  *
  * Copies the ucontext in from uap->sigcntxp, restores the general-purpose
  * and floating-point machine state from it, and reinstates the saved
  * signal mask.
  *
  * Returns EFAULT when uap is NULL or the copyin fails, and EJUSTRETURN
  * once the context has been restored.
  */
 int
 sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	ucontext_t uc;
-	uint32_t spsr;
+	int error;
 
 	if (uap == NULL)
 		return (EFAULT);
 	if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
 		return (EFAULT);
 
-	spsr = uc.uc_mcontext.mc_gpregs.gp_spsr;
-	if ((spsr & PSR_M_MASK) != PSR_M_EL0t ||
-	    (spsr & (PSR_F | PSR_I | PSR_A | PSR_D)) != 0)
-		return (EINVAL); 
-
-	set_mcontext(td, &uc.uc_mcontext);
+	error = set_mcontext(td, &uc.uc_mcontext);
+	if (error != 0)
+		return (error);
 	set_fpcontext(td, &uc.uc_mcontext);
 
 	/* Restore signal mask. */
 	kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
 
 	return (EJUSTRETURN);
 }
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() uses this to start a
  * backtrace at the function that dropped us into the debugger: the
  * context lives in the trapframe, but the trace is driven from a PCB.
  * Only what a backtrace needs is filled in -- x0 up to (not including)
  * PCB_LR, the link register, the PC and the stack pointer.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 	int reg;
 
 	pcb->pcb_pc = tf->tf_elr;
 	pcb->pcb_sp = tf->tf_sp;
 	pcb->pcb_x[PCB_LR] = tf->tf_lr;
 
 	for (reg = 0; reg < PCB_LR; reg++)
 		pcb->pcb_x[reg] = tf->tf_x[reg];
 }
 
 /*
  * Deliver a signal to the current thread by building a signal frame on
  * the user stack and redirecting the thread to the handler.
  *
  * catcher: handler address, loaded into the trapframe's return PC.
  * ksi:     signal information; ksi_signo becomes x0 and ksi_info is
  *          copied into the frame.
  * mask:    signal mask stored in the frame's ucontext.
  *
  * Called with the process lock and the sigacts mutex held; both are
  * dropped around the copyout and re-acquired before returning.  If the
  * frame cannot be copied out the process is killed with SIGILL.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct thread *td;
 	struct proc *p;
 	struct trapframe *tf;
 	struct sigframe *fp, frame;
 	struct sigacts *psp;
 	struct sysentvec *sysent;
 	int code, onstack, sig;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 
 	tf = td->td_frame;
 	onstack = sigonstack(tf->tf_sp);
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	    catcher, sig);
 
 	/* Allocate and validate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else {
 		fp = (struct sigframe *)td->td_frame->tf_sp;
 	}
 
 	/* Make room, keeping the stack aligned */
 	fp--;
 	fp = (struct sigframe *)STACKALIGN(fp);
 
 	/*
 	 * Fill in the frame to copy out.  Clear it first so that members
 	 * and padding not assigned below do not carry stale kernel stack
 	 * contents out to userland.
 	 */
 	memset(&frame, 0, sizeof(frame));
 	get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
 	get_fpcontext(td, &frame.sf_uc.uc_mcontext);
 	frame.sf_si = ksi->ksi_info;
 	frame.sf_uc.uc_sigmask = *mask;
 	/*
 	 * Copy the alternate stack description first and set ss_flags
 	 * afterwards; in the opposite order the structure copy overwrites
 	 * the computed flags.
 	 */
 	frame.sf_uc.uc_stack = td->td_sigstk;
 	frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
 	    ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(td->td_proc);
 
 	/* Copy the sigframe out to the user's stack. */
 	if (copyout(&frame, fp, sizeof(*fp)) != 0) {
 		/* Process has trashed its stack. Kill it. */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Handler arguments: signal number, siginfo pointer, ucontext pointer. */
 	tf->tf_x[0] = sig;
 	tf->tf_x[1] = (register_t)&fp->sf_si;
 	tf->tf_x[2] = (register_t)&fp->sf_uc;
 
 	tf->tf_elr = (register_t)catcher;
 	tf->tf_sp = (register_t)fp;
 	/* The handler returns into the signal trampoline code. */
 	sysent = p->p_sysent;
 	if (sysent->sv_sigcode_base != 0)
 		tf->tf_lr = (register_t)sysent->sv_sigcode_base;
 	else
 		tf->tf_lr = (register_t)(sysent->sv_psstrings -
 		    *(sysent->sv_szsigcode));
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr,
 	    tf->tf_sp);
 
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * Wire up proc0/thread0: link them together, place thread0's PCB in the
  * last struct pcb-sized slot below the address passed as kstack, reset its
  * FP bookkeeping, and publish the PCB through the boot CPU's pcpu data.
  */
 static void
 init_proc0(vm_offset_t kstack)
 {
 	struct pcpu *bootcpu;
 	struct pcb *pcb;
 
 	bootcpu = &__pcpu[0];
 
 	proc_linkup0(&proc0, &thread0);
 	thread0.td_kstack = kstack;
 
 	pcb = (struct pcb *)kstack - 1;
 	thread0.td_pcb = pcb;
 	pcb->pcb_fpflags = 0;
 	pcb->pcb_fpusaved = &pcb->pcb_fpustate;
 	pcb->pcb_vfpcpu = UINT_MAX;
 
 	thread0.td_frame = &proc0_tf;
 	bootcpu->pc_curpcb = pcb;
 }
 
 /*
  * Local declaration of a UEFI memory map descriptor record.
  * NOTE(review): nothing in the visible code references this typedef --
  * add_efi_map_entries() walks struct efi_md instead; confirm whether it
  * is still needed.
  */
 typedef struct {
 	uint32_t type;
 	uint64_t phys_start;
 	uint64_t virt_start;
 	uint64_t num_pages;
 	uint64_t attr;
 } EFI_MEMORY_DESCRIPTOR;
 
 /*
  * Add the physical range [base, base + length) to physmap, an array of
  * (start, end) address pairs kept in ascending address order.
  *
  * *physmap_idxp is the running slot index into physmap and is advanced by
  * two when a new pair is inserted.  A range that abuts an existing entry
  * is merged into that entry instead.  Zero-length ranges and ranges that
  * overlap an existing entry are ignored.
  *
  * Returns 1 when the caller may keep adding entries, or 0 once the array
  * has no room left.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     u_int *physmap_idxp)
 {
 	u_int i, insert_idx, _physmap_idx;
 
 	_physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 *
 	 * NOTE(review): the bound is i <= _physmap_idx, so the pair at
 	 * _physmap_idx itself is examined too; initarm() treats that index
 	 * as one past the last valid pair, so this appears to rely on the
 	 * unused slots being zero -- confirm.
 	 */
 	insert_idx = _physmap_idx;
 	for (i = 0; i <= _physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			/* Entirely below entry i: insert in front of it. */
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= _physmap_idx &&
 	    base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is needed.  The caller's index is advanced before the
 	 * capacity check, so it already equals PHYSMAP_SIZE when 0 is
 	 * returned.
 	 */
 	_physmap_idx += 2;
 	*physmap_idxp = _physmap_idx;
 	if (_physmap_idx == PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = _physmap_idx; i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 #ifdef FDT
 /*
  * Feed the first mrcnt memory regions in mr to add_physmap_entry(),
  * stopping early as soon as it reports that physmap is full.
  */
 static void
 add_fdt_mem_regions(struct mem_region *mr, int mrcnt, vm_paddr_t *physmap,
     u_int *physmap_idxp)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < mrcnt &&
 	    add_physmap_entry(mr[idx].mr_start, mr[idx].mr_size, physmap,
 	    physmap_idxp) != 0)
 		idx++;
 }
 #endif
 
 /*
  * Walk the UEFI memory map that follows efihdr and add every usable
  * descriptor to physmap through add_physmap_entry().
  *
  * Loader code/data, boot services code/data and free (conventional)
  * memory are treated as usable; every other descriptor type is skipped.
  * With RB_VERBOSE set in boothowto each descriptor is also printed.  The
  * walk stops early if add_physmap_entry() reports that physmap is full,
  * and nothing is done when the header's descriptor size is zero.
  */
 static void
 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
     u_int *physmap_idxp)
 {
 	struct efi_md *map, *p;
 	const char *type;
 	size_t efisz;
 	int ndesc, i;
 
 	/* Printable names, indexed by descriptor type. */
 	static const char *types[] = {
 		"Reserved",
 		"LoaderCode",
 		"LoaderData",
 		"BootServicesCode",
 		"BootServicesData",
 		"RuntimeServicesCode",
 		"RuntimeServicesData",
 		"ConventionalMemory",
 		"UnusableMemory",
 		"ACPIReclaimMemory",
 		"ACPIMemoryNVS",
 		"MemoryMappedIO",
 		"MemoryMappedIOPortSpace",
 		"PalCode",
 		"PersistentMemory"
 	};
 
 	/*
 	 * Memory map data provided by UEFI via the GetMemoryMap
 	 * Boot Services API.
 	 *
 	 * The descriptors start right after the header, whose size is
 	 * rounded up to a multiple of 16 bytes.
 	 */
 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 	map = (struct efi_md *)((uint8_t *)efihdr + efisz); 
 
 	/* A zero descriptor size would make the division below invalid. */
 	if (efihdr->descriptor_size == 0)
 		return;
 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
 
 	if (boothowto & RB_VERBOSE)
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	/* Step by the firmware-reported descriptor size. */
 	for (i = 0, p = map; i < ndesc; i++,
 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 		if (boothowto & RB_VERBOSE) {
 			if (p->md_type < nitems(types))
 				type = types[p->md_type];
 			else
 				type = "<INVALID>";
 			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
 			    p->md_virt, p->md_pages);
 			if (p->md_attr & EFI_MD_ATTR_UC)
 				printf("UC ");
 			if (p->md_attr & EFI_MD_ATTR_WC)
 				printf("WC ");
 			if (p->md_attr & EFI_MD_ATTR_WT)
 				printf("WT ");
 			if (p->md_attr & EFI_MD_ATTR_WB)
 				printf("WB ");
 			if (p->md_attr & EFI_MD_ATTR_UCE)
 				printf("UCE ");
 			if (p->md_attr & EFI_MD_ATTR_WP)
 				printf("WP ");
 			if (p->md_attr & EFI_MD_ATTR_RP)
 				printf("RP ");
 			if (p->md_attr & EFI_MD_ATTR_XP)
 				printf("XP ");
 			if (p->md_attr & EFI_MD_ATTR_NV)
 				printf("NV ");
 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
 				printf("MORE_RELIABLE ");
 			if (p->md_attr & EFI_MD_ATTR_RO)
 				printf("RO ");
 			if (p->md_attr & EFI_MD_ATTR_RT)
 				printf("RUNTIME");
 			printf("\n");
 		}
 
 		switch (p->md_type) {
 		case EFI_MD_TYPE_CODE:
 		case EFI_MD_TYPE_DATA:
 		case EFI_MD_TYPE_BS_CODE:
 		case EFI_MD_TYPE_BS_DATA:
 		case EFI_MD_TYPE_FREE:
 			/*
 			 * We're allowed to use any entry with these types.
 			 */
 			break;
 		default:
 			continue;
 		}
 
 		/* The descriptor length is in pages; convert to bytes. */
 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 		    physmap, physmap_idxp))
 			break;
 	}
 }
 
 #ifdef FDT
 /*
  * Look up the device tree blob address in the kernel's preload metadata
  * and hand it to the Open Firmware layer.  A missing blob is only
  * reported; a failure to install or initialise FDT support panics.
  */
 static void
 try_load_dtb(caddr_t kmdp)
 {
 	vm_offset_t dtb_addr;
 
 	dtb_addr = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t);
 	if (dtb_addr != (vm_offset_t)NULL) {
 		if (OF_install(OFW_FDT, 0) == FALSE)
 			panic("Cannot install FDT");
 
 		if (OF_init((void *)dtb_addr) != 0)
 			panic("OF_init failed with the found device tree");
 		return;
 	}
 
 	printf("ERROR loading DTB\n");
 }
 #endif
 
 /*
  * Choose the bus enumeration method (FDT or ACPI) for this boot.
  *
  * The comma-separated "kern.cfg.order" tunable, when set, lists the
  * methods to try in order; the first listed method that is actually
  * available is selected.  Without the tunable FDT is preferred over ACPI.
  *
  * Returns true when the selection is valid.  Returns false when the
  * tunable was set but named no usable method; a default is still selected
  * in that case so that the console can come up before the caller panics.
  */
 static bool
 bus_probe(void)
 {
 	bool has_acpi, has_fdt, order_set;
 	char *order, *env;
 
 	has_acpi = has_fdt = false;
 
 #ifdef FDT
 	has_fdt = (OF_peer(0) != 0);
 #endif
 #ifdef DEV_ACPI
 	has_acpi = (acpi_find_table(ACPI_SIG_SPCR) != 0);
 #endif
 
 	env = kern_getenv("kern.cfg.order");
 	/* Remember this now; env must not be inspected after freeenv(). */
 	order_set = (env != NULL);
 	if (order_set) {
 		order = env;
 		while (order != NULL) {
 			if (has_acpi &&
 			    strncmp(order, "acpi", 4) == 0 &&
 			    (order[4] == ',' || order[4] == '\0')) {
 				arm64_bus_method = ARM64_BUS_ACPI;
 				break;
 			}
 			if (has_fdt &&
 			    strncmp(order, "fdt", 3) == 0 &&
 			    (order[3] == ',' || order[3] == '\0')) {
 				arm64_bus_method = ARM64_BUS_FDT;
 				break;
 			}
 			/*
 			 * Advance to the next entry.  strchr() leaves us on
 			 * the separator itself, so step over it; otherwise
 			 * no entry after the first could ever match.
 			 */
 			order = strchr(order, ',');
 			if (order != NULL)
 				order++;
 		}
 		freeenv(env);
 
 		/* If we set the bus method it is valid */
 		if (arm64_bus_method != ARM64_BUS_NONE)
 			return (true);
 	}
 	/* If no order or an invalid order was set use the default */
 	if (arm64_bus_method == ARM64_BUS_NONE) {
 		if (has_fdt)
 			arm64_bus_method = ARM64_BUS_FDT;
 		else if (has_acpi)
 			arm64_bus_method = ARM64_BUS_ACPI;
 	}
 
 	/*
 	 * If no option was set the default is valid, otherwise we are
 	 * setting one to get cninit() working, then calling panic to tell
 	 * the user about the invalid bus setup.
 	 */
 	return (!order_set);
 }
 
 /*
  * Derive the cache line sizes from CTR_EL0 and DCZID_EL0 and store them
  * in dcache_line_size, icache_line_size, idcache_line_size and
  * dczva_line_size.  When DC ZVA is permitted, pagezero is switched to
  * pagezero_cache.
  */
 static void
 cache_setup(void)
 {
 	int dcache_line_shift, icache_line_shift, dczva_line_shift;
 	uint32_t ctr_el0;
 	uint32_t dczid_el0;
 
 	ctr_el0 = READ_SPECIALREG(ctr_el0);
 
 	/* Read the log2 words in each D cache line */
 	dcache_line_shift = CTR_DLINE_SIZE(ctr_el0);
 	/* Get the D cache line size */
 	dcache_line_size = sizeof(int) << dcache_line_shift;
 
 	/* And the same for the I cache */
 	icache_line_shift = CTR_ILINE_SIZE(ctr_el0);
 	icache_line_size = sizeof(int) << icache_line_shift;
 
 	/* The smaller of the two line sizes. */
 	idcache_line_size = MIN(dcache_line_size, icache_line_size);
 
 	dczid_el0 = READ_SPECIALREG(dczid_el0);
 
 	/* Check if dc zva is not prohibited */
 	if (dczid_el0 & DCZID_DZP)
 		dczva_line_size = 0;
 	else {
 		/* Same as with above calculations */
 		dczva_line_shift = DCZID_BS_SIZE(dczid_el0);
 		dczva_line_size = sizeof(int) << dczva_line_shift;
 
 		/* Change pagezero function */
 		pagezero = pagezero_cache;
 	}
 }
 
 /*
  * Early machine-dependent initialization, driven by the boot parameters
  * in abp.
  *
  * In order: locates the preloaded kernel metadata, picks up boothowto and
  * the static kernel environment, loads the device tree (FDT kernels),
  * builds the physical memory map from the EFI map or the device tree,
  * sets up the boot CPU's pcpu data, bootstraps pmap, selects the bus
  * enumeration method, brings up the console, and initializes proc0, the
  * message buffer, mutexes and the debugger hooks.
  */
 void
 initarm(struct arm64_bootparams *abp)
 {
 	struct efi_map_header *efihdr;
 	struct pcpu *pcpup;
 #ifdef FDT
 	struct mem_region mem_regions[FDT_MEM_REGIONS];
 	int mem_regions_sz;
 #endif
 	vm_offset_t lastaddr;
 	caddr_t kmdp;
 	vm_paddr_t mem_len;
 	bool valid;
 	int i;
 
 	/* Set the module data location */
 	preload_metadata = (caddr_t)(uintptr_t)(abp->modulep);
 
 	/* Find the kernel address */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 
 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 	init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0);
 
 #ifdef FDT
 	try_load_dtb(kmdp);
 #endif
 
 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
 
 	/* Find the address to start allocating from */
 	lastaddr = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
 
 	/* Load the physical memory ranges */
 	physmap_idx = 0;
 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	/* Prefer the EFI memory map; fall back to the device tree. */
 	if (efihdr != NULL)
 		add_efi_map_entries(efihdr, physmap, &physmap_idx);
 #ifdef FDT
 	else {
 		/* Grab physical memory regions information from device tree. */
 		if (fdt_get_mem_regions(mem_regions, &mem_regions_sz,
 		    NULL) != 0)
 			panic("Cannot get physical memory regions");
 		add_fdt_mem_regions(mem_regions, mem_regions_sz, physmap,
 		    &physmap_idx);
 	}
 #endif
 
 	/*
 	 * Print the memory map
 	 *
 	 * Copies physmap into dump_avail, terminates the copy with a zero
 	 * pair and totals the memory size in mem_len.
 	 */
 	mem_len = 0;
 	for (i = 0; i < physmap_idx; i += 2) {
 		dump_avail[i] = physmap[i];
 		dump_avail[i + 1] = physmap[i + 1];
 		mem_len += physmap[i + 1] - physmap[i];
 	}
 	dump_avail[i] = 0;
 	dump_avail[i + 1] = 0;
 
 	/* Set the pcpu data, this is needed by pmap_bootstrap */
 	pcpup = &__pcpu[0];
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 
 	/*
 	 * Set the pcpu pointer with a backup in tpidr_el1 to be
 	 * loaded when entering the kernel from userland.
 	 */
 	__asm __volatile(
 	    "mov x18, %0 \n"
 	    "msr tpidr_el1, %0" :: "r"(pcpup));
 
 	PCPU_SET(curthread, &thread0);
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 
 	cache_setup();
 	pan_setup();
 
 	/* Bootstrap enough of pmap  to enter the kernel proper */
 	pmap_bootstrap(abp->kern_l0pt, abp->kern_l1pt,
 	    KERNBASE - abp->kern_delta, lastaddr - KERNBASE);
 
 	devmap_bootstrap(0, NULL);
 
 	valid = bus_probe();
 
 	/* Bring the console up before reporting a bad bus configuration. */
 	cninit();
 
 	if (!valid)
 		panic("Invalid bus configuration: %s",
 		    kern_getenv("kern.cfg.order"));
 
 	init_proc0(abp->kern_stack);
 	msgbufinit(msgbufp, msgbufsize);
 	mutex_init();
 	init_param2(physmem);
 
 	dbg_init();
 	kdb_init();
 	pan_enable();
 
 	early_boot = 0;
 }
 
 /*
  * Debug hardware setup: writes 0 to OSLAR_EL1 to clear the OS lock and
  * initializes the debug monitor.
  */
 void
 dbg_init(void)
 {
 
 	/* Clear OS lock */
 	WRITE_SPECIALREG(OSLAR_EL1, 0);
 
 	/* This permits DDB to use debug registers for watchpoints. */
 	dbg_monitor_init();
 
 	/* TODO: Eventually will need to initialize debug registers here. */
 }
 
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * DDB "show specialregs" command: read each system register listed below
  * and print it as "<name> = <value>".  Registers inside "#if 0" blocks
  * are left out for the reasons given next to them.
  */
 DB_SHOW_COMMAND(specialregs, db_show_spregs)
 {
 /* Print one register; the name is stringified for the label. */
 #define	PRINT_REG(reg)	\
     db_printf(__STRING(reg) " = %#016lx\n", READ_SPECIALREG(reg))
 
 	PRINT_REG(actlr_el1);
 	PRINT_REG(afsr0_el1);
 	PRINT_REG(afsr1_el1);
 	PRINT_REG(aidr_el1);
 	PRINT_REG(amair_el1);
 	PRINT_REG(ccsidr_el1);
 	PRINT_REG(clidr_el1);
 	PRINT_REG(contextidr_el1);
 	PRINT_REG(cpacr_el1);
 	PRINT_REG(csselr_el1);
 	PRINT_REG(ctr_el0);
 	PRINT_REG(currentel);
 	PRINT_REG(daif);
 	PRINT_REG(dczid_el0);
 	PRINT_REG(elr_el1);
 	PRINT_REG(esr_el1);
 	PRINT_REG(far_el1);
 #if 0
 	/* ARM64TODO: Enable VFP before reading floating-point registers */
 	PRINT_REG(fpcr);
 	PRINT_REG(fpsr);
 #endif
 	PRINT_REG(id_aa64afr0_el1);
 	PRINT_REG(id_aa64afr1_el1);
 	PRINT_REG(id_aa64dfr0_el1);
 	PRINT_REG(id_aa64dfr1_el1);
 	PRINT_REG(id_aa64isar0_el1);
 	PRINT_REG(id_aa64isar1_el1);
 	PRINT_REG(id_aa64pfr0_el1);
 	PRINT_REG(id_aa64pfr1_el1);
 	PRINT_REG(id_afr0_el1);
 	PRINT_REG(id_dfr0_el1);
 	PRINT_REG(id_isar0_el1);
 	PRINT_REG(id_isar1_el1);
 	PRINT_REG(id_isar2_el1);
 	PRINT_REG(id_isar3_el1);
 	PRINT_REG(id_isar4_el1);
 	PRINT_REG(id_isar5_el1);
 	PRINT_REG(id_mmfr0_el1);
 	PRINT_REG(id_mmfr1_el1);
 	PRINT_REG(id_mmfr2_el1);
 	PRINT_REG(id_mmfr3_el1);
 #if 0
 	/* Missing from llvm */
 	PRINT_REG(id_mmfr4_el1);
 #endif
 	PRINT_REG(id_pfr0_el1);
 	PRINT_REG(id_pfr1_el1);
 	PRINT_REG(isr_el1);
 	PRINT_REG(mair_el1);
 	PRINT_REG(midr_el1);
 	PRINT_REG(mpidr_el1);
 	PRINT_REG(mvfr0_el1);
 	PRINT_REG(mvfr1_el1);
 	PRINT_REG(mvfr2_el1);
 	PRINT_REG(revidr_el1);
 	PRINT_REG(sctlr_el1);
 	PRINT_REG(sp_el0);
 	PRINT_REG(spsel);
 	PRINT_REG(spsr_el1);
 	PRINT_REG(tcr_el1);
 	PRINT_REG(tpidr_el0);
 	PRINT_REG(tpidr_el1);
 	PRINT_REG(tpidrro_el0);
 	PRINT_REG(ttbr0_el1);
 	PRINT_REG(ttbr1_el1);
 	PRINT_REG(vbar_el1);
 #undef PRINT_REG
 }
 
 /*
  * DDB "show vtop" command: run the given virtual address through the four
  * stage 1 translation helpers (EL1/EL0, read/write) and print each
  * result.  Prints a usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(vtop, db_show_vtop)
 {
 	uint64_t pa;
 
 	if (!have_addr) {
 		db_printf("show vtop <virt_addr>\n");
 		return;
 	}
 
 	pa = arm64_address_translate_s1e1r(addr);
 	db_printf("EL1 physical address reg (read):  0x%016lx\n", pa);
 	pa = arm64_address_translate_s1e1w(addr);
 	db_printf("EL1 physical address reg (write): 0x%016lx\n", pa);
 	pa = arm64_address_translate_s1e0r(addr);
 	db_printf("EL0 physical address reg (read):  0x%016lx\n", pa);
 	pa = arm64_address_translate_s1e0w(addr);
 	db_printf("EL0 physical address reg (write): 0x%016lx\n", pa);
 }
 #endif
Index: projects/bsd_rdma_4_9/sys/arm64/cloudabi64/cloudabi64_sysvec.c
===================================================================
--- projects/bsd_rdma_4_9/sys/arm64/cloudabi64/cloudabi64_sysvec.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/arm64/cloudabi64/cloudabi64_sysvec.c	(revision 326162)
@@ -1,189 +1,189 @@
 /*-
  * Copyright (c) 2015 Nuxi, https://nuxi.nl/
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/frame.h>
 #include <machine/pcb.h>
 #include <machine/vmparam.h>
 
 #include <compat/cloudabi/cloudabi_util.h>
 
 #include <compat/cloudabi64/cloudabi64_syscall.h>
 #include <compat/cloudabi64/cloudabi64_util.h>
 
 extern const char *cloudabi64_syscallnames[];
 extern struct sysent cloudabi64_sysent[];
 
 /*
  * Set up the initial register state of a freshly executed CloudABI
  * process: the generic exec_setregs() setup first, then the
  * CloudABI-specific x0 and TLS values described below.
  */
 static void
 cloudabi64_proc_setregs(struct thread *td, struct image_params *imgp,
     unsigned long stack)
 {
 	struct trapframe *regs;
 
 	exec_setregs(td, imgp, stack);
 
 	/*
 	 * The stack now contains a pointer to the TCB and the auxiliary
 	 * vector. Let x0 point to the auxiliary vector, and set
 	 * tpidr_el0 to the TCB.
 	 */
 	regs = td->td_frame;
-	regs->tf_x[0] = td->td_retval[0] =
+	regs->tf_x[0] =
 	    stack + roundup(sizeof(cloudabi64_tcb_t), sizeof(register_t));
 	(void)cpu_set_user_tls(td, (void *)stack);
 }
 
 /*
  * Decode a CloudABI system call from the thread's trapframe into
  * td->td_sa.
  *
  * The system call number comes from x8 and the arguments from x0 upwards.
  * Returns ENOSYS for an out-of-range number, otherwise 0 with the default
  * return values primed.
  */
 static int
 cloudabi64_fetch_syscall_args(struct thread *td)
 {
 	struct trapframe *tf = td->td_frame;
 	struct syscall_args *call = &td->td_sa;
 	int argno;
 
 	/* Obtain system call number. */
 	call->code = tf->tf_x[8];
 	if (call->code >= CLOUDABI64_SYS_MAXSYSCALL)
 		return (ENOSYS);
 	call->callp = &cloudabi64_sysent[call->code];
 	call->narg = call->callp->sy_narg;
 
 	/* Fetch system call arguments. */
 	for (argno = 0; argno < MAXARGS; argno++)
 		call->args[argno] = tf->tf_x[argno];
 
 	/* Default system call return values. */
 	td->td_retval[0] = 0;
 	td->td_retval[1] = tf->tf_x[1];
 	return (0);
 }
 
 static void
 cloudabi64_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame = td->td_frame;
 
 	switch (error) {
 	case 0:
 		/* System call succeeded. */
 		frame->tf_x[0] = td->td_retval[0];
 		frame->tf_x[1] = td->td_retval[1];
 		frame->tf_spsr &= ~PSR_C;
 		break;
 	case ERESTART:
 		/* Restart system call. */
 		frame->tf_elr -= 4;
 		break;
 	case EJUSTRETURN:
 		break;
 	default:
 		/* System call returned an error. */
 		frame->tf_x[0] = cloudabi_convert_errno(error);
 		frame->tf_spsr |= PSR_C;
 		break;
 	}
 }
 
 /*
  * Finish setting up a thread that is about to run for the first time.
  *
  * Only a forked process gets the fork return values
  * (CLOUDABI_PROCESS_CHILD in x0, the thread ID in x1); a thread created
  * inside an existing process keeps its registers untouched.
  */
 static void
 cloudabi64_schedtail(struct thread *td)
 {
 	struct trapframe *tf;
 
 	/* Not forking: nothing to adjust. */
 	if ((td->td_pflags & TDP_FORKING) == 0)
 		return;
 
 	tf = td->td_frame;
 	tf->tf_x[0] = CLOUDABI_PROCESS_CHILD;
 	tf->tf_x[1] = td->td_tid;
 }
 
 /*
  * Prepare the registers of a new CloudABI thread.
  *
  * The stack and entry point come from attr; the entry point receives the
  * new thread's ID in x0 and the caller-supplied argument in x1.  Returns
  * the result of installing tcb as the thread's TLS pointer.
  */
 int
 cloudabi64_thread_setregs(struct thread *td,
     const cloudabi64_threadattr_t *attr, uint64_t tcb)
 {
 	struct trapframe *tf;
 	stack_t stk;
 
 	/* Generic setup: run the entry point on the supplied stack. */
 	stk.ss_sp = TO_PTR(attr->stack);
 	stk.ss_size = attr->stack_len;
 	cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stk);
 
 	/* Entry point arguments: thread ID, then the parent's argument. */
 	tf = td->td_frame;
 	tf->tf_x[0] = td->td_tid;
 	tf->tf_x[1] = attr->argument;
 
 	/* Set up TLS. */
 	return (cpu_set_user_tls(td, (void *)tcb));
 }
 
 /*
  * System call vector for 64-bit CloudABI ELF executables.  It ties
  * together the system call table and names with the register setup,
  * argument fetch, return value and schedtail hooks defined in this file.
  */
 static struct sysentvec cloudabi64_elf_sysvec = {
 	.sv_size		= CLOUDABI64_SYS_MAXSYSCALL,
 	.sv_table		= cloudabi64_sysent,
 	.sv_fixup		= cloudabi64_fixup,
 	.sv_name		= "CloudABI ELF64",
 	.sv_coredump		= elf64_coredump,
 	.sv_pagesize		= PAGE_SIZE,
 	.sv_minuser		= VM_MIN_ADDRESS,
 	.sv_maxuser		= VM_MAXUSER_ADDRESS,
 	.sv_stackprot		= VM_PROT_READ | VM_PROT_WRITE,
 	.sv_copyout_strings	= cloudabi64_copyout_strings,
 	.sv_setregs		= cloudabi64_proc_setregs,
 	.sv_flags		= SV_ABI_CLOUDABI | SV_CAPSICUM | SV_LP64,
 	.sv_set_syscall_retval	= cloudabi64_set_syscall_retval,
 	.sv_fetch_syscall_args	= cloudabi64_fetch_syscall_args,
 	.sv_syscallnames	= cloudabi64_syscallnames,
 	.sv_schedtail		= cloudabi64_schedtail,
 };
 
 /* Hand the vector to the sysentvec initialization hook. */
 INIT_SYSENTVEC(elf_sysvec, &cloudabi64_elf_sysvec);
 
 /*
  * ELF brand entry that selects the sysentvec above for AArch64 binaries
  * carrying the CloudABI OS ABI value.
  */
 Elf64_Brandinfo cloudabi64_brand = {
 	.brand		= ELFOSABI_CLOUDABI,
 	.machine	= EM_AARCH64,
 	.sysvec		= &cloudabi64_elf_sysvec,
 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_ONLY_STATIC,
 };
Index: projects/bsd_rdma_4_9/sys/arm64/include/armreg.h
===================================================================
--- projects/bsd_rdma_4_9/sys/arm64/include/armreg.h	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/arm64/include/armreg.h	(revision 326162)
@@ -1,646 +1,647 @@
 /*-
  * Copyright (c) 2013, 2014 Andrew Turner
  * Copyright (c) 2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Andrew Turner under
  * sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_ARMREG_H_
 #define	_MACHINE_ARMREG_H_
 
 #define	INSN_SIZE		4
 
 /*
  * Read the system register `reg' with an MRS instruction and yield its
  * value as a uint64_t.  `reg' is turned into instruction text with
  * __STRING(), so it must be a register name, not an expression.
  */
 #define	READ_SPECIALREG(reg)						\
 ({	uint64_t val;							\
 	__asm __volatile("mrs	%0, " __STRING(reg) : "=&r" (val));	\
 	val;								\
 })
 /*
  * Write `val', converted to uint64_t, to the system register `reg' with
  * an MSR instruction.  The argument is parenthesized so the cast covers
  * the whole expression instead of only its first operand.
  */
 #define	WRITE_SPECIALREG(reg, val)					\
 	__asm __volatile("msr	" __STRING(reg) ", %0" : :		\
 	    "r"((uint64_t)(val)))
 
 /* CNTHCTL_EL2 - Counter-timer Hypervisor Control register */
 #define	CNTHCTL_EVNTI_MASK	(0xf << 4) /* Bit to trigger event stream */
 #define	CNTHCTL_EVNTDIR		(1 << 3) /* Control transition trigger bit */
 #define	CNTHCTL_EVNTEN		(1 << 2) /* Enable event stream */
 #define	CNTHCTL_EL1PCEN		(1 << 1) /* Allow EL0/1 physical timer access */
 #define	CNTHCTL_EL1PCTEN	(1 << 0) /*Allow EL0/1 physical counter access*/
 
 /* CPACR_EL1 */
 #define	CPACR_FPEN_MASK		(0x3 << 20)
 #define	 CPACR_FPEN_TRAP_ALL1	(0x0 << 20) /* Traps from EL0 and EL1 */
 #define	 CPACR_FPEN_TRAP_EL0	(0x1 << 20) /* Traps from EL0 */
 #define	 CPACR_FPEN_TRAP_ALL2	(0x2 << 20) /* Traps from EL0 and EL1 */
 #define	 CPACR_FPEN_TRAP_NONE	(0x3 << 20) /* No traps */
 #define	CPACR_TTA		(0x1 << 28)
 
 /* CTR_EL0 - Cache Type Register */
 #define	CTR_DLINE_SHIFT		16
 #define	CTR_DLINE_MASK		(0xf << CTR_DLINE_SHIFT)
 #define	CTR_DLINE_SIZE(reg)	(((reg) & CTR_DLINE_MASK) >> CTR_DLINE_SHIFT)
 #define	CTR_ILINE_SHIFT		0
 #define	CTR_ILINE_MASK		(0xf << CTR_ILINE_SHIFT)
 #define	CTR_ILINE_SIZE(reg)	(((reg) & CTR_ILINE_MASK) >> CTR_ILINE_SHIFT)
 
 /* DCZID_EL0 - Data Cache Zero ID register */
 #define DCZID_DZP		(1 << 4) /* DC ZVA prohibited if non-0 */
 #define DCZID_BS_SHIFT		0
 #define DCZID_BS_MASK		(0xf << DCZID_BS_SHIFT)
 #define	DCZID_BS_SIZE(reg)	(((reg) & DCZID_BS_MASK) >> DCZID_BS_SHIFT)
 
 /* ESR_ELx */
 /*
  * Layout used below: ISS occupies bits [24:0] (ISS_DATA_ISV is bit 24),
  * IL is bit 25 and the exception class (EC) is bits [31:26].
  */
 #define	ESR_ELx_ISS_MASK	0x01ffffff
 /* ISS flags for instruction aborts. */
 #define	 ISS_INSN_FnV		(0x01 << 10)
 #define	 ISS_INSN_EA		(0x01 << 9)
 #define	 ISS_INSN_S1PTW		(0x01 << 7)
 #define	 ISS_INSN_IFSC_MASK	(0x1f << 0)
 /* ISS flags for data aborts. */
 #define	 ISS_DATA_ISV		(0x01 << 24)
 #define	 ISS_DATA_SAS_MASK	(0x03 << 22)
 #define	 ISS_DATA_SSE		(0x01 << 21)
 #define	 ISS_DATA_SRT_MASK	(0x1f << 16)
 #define	 ISS_DATA_SF		(0x01 << 15)
 #define	 ISS_DATA_AR		(0x01 << 14)
 #define	 ISS_DATA_FnV		(0x01 << 10)
 #define	 ISS_DATA_EA		(0x01 << 9)
 #define	 ISS_DATA_CM		(0x01 << 8)
 #define	 ISS_DATA_S1PTW		(0x01 << 7)
 #define	 ISS_DATA_WnR		(0x01 << 6)
 /* Misspelled historical names, kept so existing users still build. */
 #define	 ISS_DATa_EA		ISS_DATA_EA
 #define	 ISS_DATa_CM		ISS_DATA_CM
 #define	 ISS_DATa_WnR		ISS_DATA_WnR
 #define	 ISS_DATA_DFSC_MASK	(0x3f << 0)
 #define	 ISS_DATA_DFSC_ASF_L0	(0x00 << 0)
 #define	 ISS_DATA_DFSC_ASF_L1	(0x01 << 0)
 #define	 ISS_DATA_DFSC_ASF_L2	(0x02 << 0)
 #define	 ISS_DATA_DFSC_ASF_L3	(0x03 << 0)
 #define	 ISS_DATA_DFSC_TF_L0	(0x04 << 0)
 #define	 ISS_DATA_DFSC_TF_L1	(0x05 << 0)
 #define	 ISS_DATA_DFSC_TF_L2	(0x06 << 0)
 #define	 ISS_DATA_DFSC_TF_L3	(0x07 << 0)
 #define	 ISS_DATA_DFSC_AFF_L1	(0x09 << 0)
 #define	 ISS_DATA_DFSC_AFF_L2	(0x0a << 0)
 #define	 ISS_DATA_DFSC_AFF_L3	(0x0b << 0)
 #define	 ISS_DATA_DFSC_PF_L1	(0x0d << 0)
 #define	 ISS_DATA_DFSC_PF_L2	(0x0e << 0)
 #define	 ISS_DATA_DFSC_PF_L3	(0x0f << 0)
 #define	 ISS_DATA_DFSC_EXT	(0x10 << 0)
 #define	 ISS_DATA_DFSC_EXT_L0	(0x14 << 0)
 #define	 ISS_DATA_DFSC_EXT_L1	(0x15 << 0)
 #define	 ISS_DATA_DFSC_EXT_L2	(0x16 << 0)
 #define	 ISS_DATA_DFSC_EXT_L3	(0x17 << 0)
 #define	 ISS_DATA_DFSC_ECC	(0x18 << 0)
 #define	 ISS_DATA_DFSC_ECC_L0	(0x1c << 0)
 #define	 ISS_DATA_DFSC_ECC_L1	(0x1d << 0)
 #define	 ISS_DATA_DFSC_ECC_L2	(0x1e << 0)
 #define	 ISS_DATA_DFSC_ECC_L3	(0x1f << 0)
 #define	 ISS_DATA_DFSC_ALIGN	(0x21 << 0)
 #define	 ISS_DATA_DFSC_TLB_CONFLICT (0x30 << 0)
 #define	ESR_ELx_IL		(0x01 << 25)
 #define	ESR_ELx_EC_SHIFT	26
 /* Unsigned: 0x3f << 26 does not fit in an int. */
 #define	ESR_ELx_EC_MASK		(0x3fU << ESR_ELx_EC_SHIFT)
 /* Extract the exception class (one of the EXCP_* values) from an ESR. */
 #define	ESR_ELx_EXCEPTION(esr)	(((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT)
 #define	 EXCP_UNKNOWN		0x00	/* Unknown exception */
 #define	 EXCP_FP_SIMD		0x07	/* VFP/SIMD trap */
 #define	 EXCP_ILL_STATE		0x0e	/* Illegal execution state */
 #define	 EXCP_SVC		0x15	/* SVC trap */
 #define	 EXCP_MSR		0x18	/* MSR/MRS trap */
 #define	 EXCP_INSN_ABORT_L	0x20	/* Instruction abort, from lower EL */
 #define	 EXCP_INSN_ABORT	0x21	/* Instruction abort, from same EL */
 #define	 EXCP_PC_ALIGN		0x22	/* PC alignment fault */
 #define	 EXCP_DATA_ABORT_L	0x24	/* Data abort, from lower EL */
 #define	 EXCP_DATA_ABORT	0x25	/* Data abort, from same EL */
 #define	 EXCP_SP_ALIGN		0x26	/* SP alignment fault */
 #define	 EXCP_TRAP_FP		0x2c	/* Trapped FP exception */
 #define	 EXCP_SERROR		0x2f	/* SError interrupt */
 #define	 EXCP_SOFTSTP_EL0	0x32	/* Software Step, from lower EL */
 #define	 EXCP_SOFTSTP_EL1	0x33	/* Software Step, from same EL */
 #define	 EXCP_WATCHPT_EL1	0x35	/* Watchpoint, from same EL */
 #define	 EXCP_BRK		0x3c	/* Breakpoint */
 
 /* ICC_CTLR_EL1 */
 #define	ICC_CTLR_EL1_EOIMODE	(1U << 1)
 
 /* ICC_IAR1_EL1 */
 #define	ICC_IAR1_EL1_SPUR	(0x03ff)
 
 /* ICC_IGRPEN0_EL1 */
 #define	ICC_IGRPEN0_EL1_EN	(1U << 0)
 
 /* ICC_PMR_EL1 */
 #define	ICC_PMR_EL1_PRIO_MASK	(0xFFUL)
 
 /* ICC_SGI1R_EL1 */
 #define	ICC_SGI1R_EL1_TL_MASK		0xffffUL
 #define	ICC_SGI1R_EL1_AFF1_SHIFT	16
 #define	ICC_SGI1R_EL1_SGIID_SHIFT	24
 #define	ICC_SGI1R_EL1_AFF2_SHIFT	32
 #define	ICC_SGI1R_EL1_AFF3_SHIFT	48
 #define	ICC_SGI1R_EL1_SGIID_MASK	0xfUL
 #define	ICC_SGI1R_EL1_IRM		(0x1UL << 40)
 
 /* ICC_SRE_EL1 */
 #define	ICC_SRE_EL1_SRE		(1U << 0)
 
 /* ICC_SRE_EL2 */
 #define	ICC_SRE_EL2_SRE		(1U << 0)
 #define	ICC_SRE_EL2_EN		(1U << 3)
 
 /* ID_AA64DFR0_EL1 */
 #define	ID_AA64DFR0_MASK		0x0000000ff0f0fffful
 #define	ID_AA64DFR0_DEBUG_VER_SHIFT	0
 #define	ID_AA64DFR0_DEBUG_VER_MASK	(0xf << ID_AA64DFR0_DEBUG_VER_SHIFT)
 #define	ID_AA64DFR0_DEBUG_VER(x)	((x) & ID_AA64DFR0_DEBUG_VER_MASK)
 #define	 ID_AA64DFR0_DEBUG_VER_8	(0x6 << ID_AA64DFR0_DEBUG_VER_SHIFT)
 #define	 ID_AA64DFR0_DEBUG_VER_8_VHE	(0x7 << ID_AA64DFR0_DEBUG_VER_SHIFT)
 #define	 ID_AA64DFR0_DEBUG_VER_8_2	(0x8 << ID_AA64DFR0_DEBUG_VER_SHIFT)
 #define	ID_AA64DFR0_TRACE_VER_SHIFT	4
 #define	ID_AA64DFR0_TRACE_VER_MASK	(0xf << ID_AA64DFR0_TRACE_VER_SHIFT)
 #define	ID_AA64DFR0_TRACE_VER(x)	((x) & ID_AA64DFR0_TRACE_VER_MASK)
 #define	 ID_AA64DFR0_TRACE_VER_NONE	(0x0 << ID_AA64DFR0_TRACE_VER_SHIFT)
 #define	 ID_AA64DFR0_TRACE_VER_IMPL	(0x1 << ID_AA64DFR0_TRACE_VER_SHIFT)
 #define	ID_AA64DFR0_PMU_VER_SHIFT	8
 #define	ID_AA64DFR0_PMU_VER_MASK	(0xf << ID_AA64DFR0_PMU_VER_SHIFT)
 #define	ID_AA64DFR0_PMU_VER(x)		((x) & ID_AA64DFR0_PMU_VER_MASK)
 #define	 ID_AA64DFR0_PMU_VER_NONE	(0x0 << ID_AA64DFR0_PMU_VER_SHIFT)
 #define	 ID_AA64DFR0_PMU_VER_3		(0x1 << ID_AA64DFR0_PMU_VER_SHIFT)
 #define	 ID_AA64DFR0_PMU_VER_3_1	(0x4 << ID_AA64DFR0_PMU_VER_SHIFT)
 #define	 ID_AA64DFR0_PMU_VER_IMPL	(0xf << ID_AA64DFR0_PMU_VER_SHIFT)
 #define	ID_AA64DFR0_BRPS_SHIFT		12
 #define	ID_AA64DFR0_BRPS_MASK		(0xf << ID_AA64DFR0_BRPS_SHIFT)
 #define	ID_AA64DFR0_BRPS(x)		\
     ((((x) >> ID_AA64DFR0_BRPS_SHIFT) & 0xf) + 1)
 #define	ID_AA64DFR0_WRPS_SHIFT		20
 #define	ID_AA64DFR0_WRPS_MASK		(0xf << ID_AA64DFR0_WRPS_SHIFT)
 #define	ID_AA64DFR0_WRPS(x)		\
     ((((x) >> ID_AA64DFR0_WRPS_SHIFT) & 0xf) + 1)
 /*
  * CTX_CMPS, bits [31:28].  The mask is unsigned long because 0xf << 28
  * does not fit in an int; the accessor yields the raw field plus one.
  */
 #define	ID_AA64DFR0_CTX_CMPS_SHIFT	28
 #define	ID_AA64DFR0_CTX_CMPS_MASK	(0xful << ID_AA64DFR0_CTX_CMPS_SHIFT)
 #define	ID_AA64DFR0_CTX_CMPS(x)		\
     ((((x) >> ID_AA64DFR0_CTX_CMPS_SHIFT) & 0xf) + 1)
 #define	ID_AA64DFR0_PMS_VER_SHIFT	32
 #define	ID_AA64DFR0_PMS_VER_MASK	(0xful << ID_AA64DFR0_PMS_VER_SHIFT)
 #define	ID_AA64DFR0_PMS_VER(x)	((x) & ID_AA64DFR0_PMS_VER_MASK)
 #define	 ID_AA64DFR0_PMS_VER_NONE	(0x0ul << ID_AA64DFR0_PMS_VER_SHIFT)
 #define	 ID_AA64DFR0_PMS_VER_V1		(0x1ul << ID_AA64DFR0_PMS_VER_SHIFT)
 
 /* ID_AA64ISAR0_EL1 */
 #define	ID_AA64ISAR0_MASK		0x0000fffff0fffff0ul
 #define	ID_AA64ISAR0_AES_SHIFT		4
 #define	ID_AA64ISAR0_AES_MASK		(0xf << ID_AA64ISAR0_AES_SHIFT)
 #define	ID_AA64ISAR0_AES(x)		((x) & ID_AA64ISAR0_AES_MASK)
 #define	 ID_AA64ISAR0_AES_NONE		(0x0 << ID_AA64ISAR0_AES_SHIFT)
 #define	 ID_AA64ISAR0_AES_BASE		(0x1 << ID_AA64ISAR0_AES_SHIFT)
 #define	 ID_AA64ISAR0_AES_PMULL		(0x2 << ID_AA64ISAR0_AES_SHIFT)
 #define	ID_AA64ISAR0_SHA1_SHIFT		8
 #define	ID_AA64ISAR0_SHA1_MASK		(0xf << ID_AA64ISAR0_SHA1_SHIFT)
 #define	ID_AA64ISAR0_SHA1(x)		((x) & ID_AA64ISAR0_SHA1_MASK)
 #define	 ID_AA64ISAR0_SHA1_NONE		(0x0 << ID_AA64ISAR0_SHA1_SHIFT)
 #define	 ID_AA64ISAR0_SHA1_BASE		(0x1 << ID_AA64ISAR0_SHA1_SHIFT)
 #define	ID_AA64ISAR0_SHA2_SHIFT		12
 #define	ID_AA64ISAR0_SHA2_MASK		(0xf << ID_AA64ISAR0_SHA2_SHIFT)
 #define	ID_AA64ISAR0_SHA2(x)		((x) & ID_AA64ISAR0_SHA2_MASK)
 #define	 ID_AA64ISAR0_SHA2_NONE		(0x0 << ID_AA64ISAR0_SHA2_SHIFT)
 #define	 ID_AA64ISAR0_SHA2_BASE		(0x1 << ID_AA64ISAR0_SHA2_SHIFT)
 #define	 ID_AA64ISAR0_SHA2_512		(0x2 << ID_AA64ISAR0_SHA2_SHIFT)
 #define	ID_AA64ISAR0_CRC32_SHIFT	16
 #define	ID_AA64ISAR0_CRC32_MASK		(0xf << ID_AA64ISAR0_CRC32_SHIFT)
 #define	ID_AA64ISAR0_CRC32(x)		((x) & ID_AA64ISAR0_CRC32_MASK)
 #define	 ID_AA64ISAR0_CRC32_NONE	(0x0 << ID_AA64ISAR0_CRC32_SHIFT)
 #define	 ID_AA64ISAR0_CRC32_BASE	(0x1 << ID_AA64ISAR0_CRC32_SHIFT)
 #define	ID_AA64ISAR0_ATOMIC_SHIFT	20
 #define	ID_AA64ISAR0_ATOMIC_MASK	(0xf << ID_AA64ISAR0_ATOMIC_SHIFT)
 #define	ID_AA64ISAR0_ATOMIC(x)		((x) & ID_AA64ISAR0_ATOMIC_MASK)
 #define	 ID_AA64ISAR0_ATOMIC_NONE	(0x0 << ID_AA64ISAR0_ATOMIC_SHIFT)
 #define	 ID_AA64ISAR0_ATOMIC_IMPL	(0x2 << ID_AA64ISAR0_ATOMIC_SHIFT)
 /*
  * RDM, bits [31:28].  The constants are unsigned long so the mask is not
  * sign-extended into bits [63:32] when applied to the 64-bit register.
  */
 #define	ID_AA64ISAR0_RDM_SHIFT		28
 #define	ID_AA64ISAR0_RDM_MASK		(0xful << ID_AA64ISAR0_RDM_SHIFT)
 #define	ID_AA64ISAR0_RDM(x)		((x) & ID_AA64ISAR0_RDM_MASK)
 #define	 ID_AA64ISAR0_RDM_NONE		(0x0ul << ID_AA64ISAR0_RDM_SHIFT)
 #define	 ID_AA64ISAR0_RDM_IMPL		(0x1ul << ID_AA64ISAR0_RDM_SHIFT)
 #define	ID_AA64ISAR0_SHA3_SHIFT		32
 #define	ID_AA64ISAR0_SHA3_MASK		(0xful << ID_AA64ISAR0_SHA3_SHIFT)
 #define	ID_AA64ISAR0_SHA3(x)		((x) & ID_AA64ISAR0_SHA3_MASK)
 #define	 ID_AA64ISAR0_SHA3_NONE		(0x0ul << ID_AA64ISAR0_SHA3_SHIFT)
 #define	 ID_AA64ISAR0_SHA3_IMPL		(0x1ul << ID_AA64ISAR0_SHA3_SHIFT)
 #define	ID_AA64ISAR0_SM3_SHIFT		36
 #define	ID_AA64ISAR0_SM3_MASK		(0xful << ID_AA64ISAR0_SM3_SHIFT)
 #define	ID_AA64ISAR0_SM3(x)		((x) & ID_AA64ISAR0_SM3_MASK)
 #define	 ID_AA64ISAR0_SM3_NONE		(0x0ul << ID_AA64ISAR0_SM3_SHIFT)
 #define	 ID_AA64ISAR0_SM3_IMPL		(0x1ul << ID_AA64ISAR0_SM3_SHIFT)
 #define	ID_AA64ISAR0_SM4_SHIFT		40
 #define	ID_AA64ISAR0_SM4_MASK		(0xful << ID_AA64ISAR0_SM4_SHIFT)
 #define	ID_AA64ISAR0_SM4(x)		((x) & ID_AA64ISAR0_SM4_MASK)
 #define	 ID_AA64ISAR0_SM4_NONE		(0x0ul << ID_AA64ISAR0_SM4_SHIFT)
 #define	 ID_AA64ISAR0_SM4_IMPL		(0x1ul << ID_AA64ISAR0_SM4_SHIFT)
 #define	ID_AA64ISAR0_DP_SHIFT		48
 #define	ID_AA64ISAR0_DP_MASK		(0xful << ID_AA64ISAR0_DP_SHIFT)
 #define	ID_AA64ISAR0_DP(x)		((x) & ID_AA64ISAR0_DP_MASK)
 #define	 ID_AA64ISAR0_DP_NONE		(0x0ul << ID_AA64ISAR0_DP_SHIFT)
 #define	 ID_AA64ISAR0_DP_IMPL		(0x1ul << ID_AA64ISAR0_DP_SHIFT)
 
 /* ID_AA64ISAR1_EL1 */
 #define	ID_AA64ISAR1_MASK		0xffffffff
 #define	ID_AA64ISAR1_DPB_SHIFT		0
 #define	ID_AA64ISAR1_DPB_MASK		(0xf << ID_AA64ISAR1_DPB_SHIFT)
 #define	ID_AA64ISAR1_DPB(x)		((x) & ID_AA64ISAR1_DPB_MASK)
 #define	 ID_AA64ISAR1_DPB_NONE		(0x0 << ID_AA64ISAR1_DPB_SHIFT)
 #define	 ID_AA64ISAR1_DPB_IMPL		(0x1 << ID_AA64ISAR1_DPB_SHIFT)
 #define	ID_AA64ISAR1_APA_SHIFT		4
 #define	ID_AA64ISAR1_APA_MASK		(0xf << ID_AA64ISAR1_APA_SHIFT)
 #define	ID_AA64ISAR1_APA(x)		((x) & ID_AA64ISAR1_APA_MASK)
 #define	 ID_AA64ISAR1_APA_NONE		(0x0 << ID_AA64ISAR1_APA_SHIFT)
 #define	 ID_AA64ISAR1_APA_IMPL		(0x1 << ID_AA64ISAR1_APA_SHIFT)
 #define	ID_AA64ISAR1_API_SHIFT		8
 #define	ID_AA64ISAR1_API_MASK		(0xf << ID_AA64ISAR1_API_SHIFT)
 #define	ID_AA64ISAR1_API(x)		((x) & ID_AA64ISAR1_API_MASK)
 #define	 ID_AA64ISAR1_API_NONE		(0x0 << ID_AA64ISAR1_API_SHIFT)
 #define	 ID_AA64ISAR1_API_IMPL		(0x1 << ID_AA64ISAR1_API_SHIFT)
 #define	ID_AA64ISAR1_JSCVT_SHIFT	12
 #define	ID_AA64ISAR1_JSCVT_MASK		(0xf << ID_AA64ISAR1_JSCVT_SHIFT)
 #define	ID_AA64ISAR1_JSCVT(x)		((x) & ID_AA64ISAR1_JSCVT_MASK)
 #define	 ID_AA64ISAR1_JSCVT_NONE	(0x0 << ID_AA64ISAR1_JSCVT_SHIFT)
 #define	 ID_AA64ISAR1_JSCVT_IMPL	(0x1 << ID_AA64ISAR1_JSCVT_SHIFT)
 #define	ID_AA64ISAR1_FCMA_SHIFT		16
 #define	ID_AA64ISAR1_FCMA_MASK		(0xf << ID_AA64ISAR1_FCMA_SHIFT)
 #define	ID_AA64ISAR1_FCMA(x)		((x) & ID_AA64ISAR1_FCMA_MASK)
 #define	 ID_AA64ISAR1_FCMA_NONE		(0x0 << ID_AA64ISAR1_FCMA_SHIFT)
 #define	 ID_AA64ISAR1_FCMA_IMPL		(0x1 << ID_AA64ISAR1_FCMA_SHIFT)
 #define	ID_AA64ISAR1_LRCPC_SHIFT	20
 #define	ID_AA64ISAR1_LRCPC_MASK		(0xf << ID_AA64ISAR1_LRCPC_SHIFT)
 #define	ID_AA64ISAR1_LRCPC(x)		((x) & ID_AA64ISAR1_LRCPC_MASK)
 #define	 ID_AA64ISAR1_LRCPC_NONE	(0x0 << ID_AA64ISAR1_LRCPC_SHIFT)
 #define	 ID_AA64ISAR1_LRCPC_IMPL	(0x1 << ID_AA64ISAR1_LRCPC_SHIFT)
 #define	ID_AA64ISAR1_GPA_SHIFT		24
 #define	ID_AA64ISAR1_GPA_MASK		(0xf << ID_AA64ISAR1_GPA_SHIFT)
 #define	ID_AA64ISAR1_GPA(x)		((x) & ID_AA64ISAR1_GPA_MASK)
 #define	 ID_AA64ISAR1_GPA_NONE		(0x0 << ID_AA64ISAR1_GPA_SHIFT)
 #define	 ID_AA64ISAR1_GPA_IMPL		(0x1 << ID_AA64ISAR1_GPA_SHIFT)
 /*
  * GPI, bits [31:28].  The constants are unsigned long so the mask is not
  * sign-extended into bits [63:32] when applied to the 64-bit register.
  */
 #define	ID_AA64ISAR1_GPI_SHIFT		28
 #define	ID_AA64ISAR1_GPI_MASK		(0xful << ID_AA64ISAR1_GPI_SHIFT)
 #define	ID_AA64ISAR1_GPI(x)		((x) & ID_AA64ISAR1_GPI_MASK)
 #define	 ID_AA64ISAR1_GPI_NONE		(0x0ul << ID_AA64ISAR1_GPI_SHIFT)
 #define	 ID_AA64ISAR1_GPI_IMPL		(0x1ul << ID_AA64ISAR1_GPI_SHIFT)
 
 /* ID_AA64MMFR0_EL1 */
 #define	ID_AA64MMFR0_MASK		0xffffffff
 #define	ID_AA64MMFR0_PA_RANGE_SHIFT	0
 #define	ID_AA64MMFR0_PA_RANGE_MASK	(0xf << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	ID_AA64MMFR0_PA_RANGE(x)	((x) & ID_AA64MMFR0_PA_RANGE_MASK)
 #define	 ID_AA64MMFR0_PA_RANGE_4G	(0x0 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	 ID_AA64MMFR0_PA_RANGE_64G	(0x1 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	 ID_AA64MMFR0_PA_RANGE_1T	(0x2 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	 ID_AA64MMFR0_PA_RANGE_4T	(0x3 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	 ID_AA64MMFR0_PA_RANGE_16T	(0x4 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	 ID_AA64MMFR0_PA_RANGE_256T	(0x5 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	 ID_AA64MMFR0_PA_RANGE_4P	(0x6 << ID_AA64MMFR0_PA_RANGE_SHIFT)
 #define	ID_AA64MMFR0_ASID_BITS_SHIFT	4
 #define	ID_AA64MMFR0_ASID_BITS_MASK	(0xf << ID_AA64MMFR0_ASID_BITS_SHIFT)
 #define	ID_AA64MMFR0_ASID_BITS(x)	((x) & ID_AA64MMFR0_ASID_BITS_MASK)
 #define	 ID_AA64MMFR0_ASID_BITS_8	(0x0 << ID_AA64MMFR0_ASID_BITS_SHIFT)
 #define	 ID_AA64MMFR0_ASID_BITS_16	(0x2 << ID_AA64MMFR0_ASID_BITS_SHIFT)
 #define	ID_AA64MMFR0_BIGEND_SHIFT	8
 #define	ID_AA64MMFR0_BIGEND_MASK	(0xf << ID_AA64MMFR0_BIGEND_SHIFT)
 #define	ID_AA64MMFR0_BIGEND(x)		((x) & ID_AA64MMFR0_BIGEND_MASK)
 #define	 ID_AA64MMFR0_BIGEND_FIXED	(0x0 << ID_AA64MMFR0_BIGEND_SHIFT)
 #define	 ID_AA64MMFR0_BIGEND_MIXED	(0x1 << ID_AA64MMFR0_BIGEND_SHIFT)
 #define	ID_AA64MMFR0_S_NS_MEM_SHIFT	12
 #define	ID_AA64MMFR0_S_NS_MEM_MASK	(0xf << ID_AA64MMFR0_S_NS_MEM_SHIFT)
 #define	ID_AA64MMFR0_S_NS_MEM(x)	((x) & ID_AA64MMFR0_S_NS_MEM_MASK)
 #define	 ID_AA64MMFR0_S_NS_MEM_NONE	(0x0 << ID_AA64MMFR0_S_NS_MEM_SHIFT)
 #define	 ID_AA64MMFR0_S_NS_MEM_DISTINCT	(0x1 << ID_AA64MMFR0_S_NS_MEM_SHIFT)
 #define	ID_AA64MMFR0_BIGEND_EL0_SHIFT	16
 #define	ID_AA64MMFR0_BIGEND_EL0_MASK	(0xf << ID_AA64MMFR0_BIGEND_EL0_SHIFT)
 #define	ID_AA64MMFR0_BIGEND_EL0(x)	((x) & ID_AA64MMFR0_BIGEND_EL0_MASK)
 #define	 ID_AA64MMFR0_BIGEND_EL0_FIXED	(0x0 << ID_AA64MMFR0_BIGEND_EL0_SHIFT)
 #define	 ID_AA64MMFR0_BIGEND_EL0_MIXED	(0x1 << ID_AA64MMFR0_BIGEND_EL0_SHIFT)
 #define	ID_AA64MMFR0_TGRAN16_SHIFT	20
 #define	ID_AA64MMFR0_TGRAN16_MASK	(0xf << ID_AA64MMFR0_TGRAN16_SHIFT)
 #define	ID_AA64MMFR0_TGRAN16(x)		((x) & ID_AA64MMFR0_TGRAN16_MASK)
 #define	 ID_AA64MMFR0_TGRAN16_NONE	(0x0 << ID_AA64MMFR0_TGRAN16_SHIFT)
 #define	 ID_AA64MMFR0_TGRAN16_IMPL	(0x1 << ID_AA64MMFR0_TGRAN16_SHIFT)
 #define	ID_AA64MMFR0_TGRAN64_SHIFT	24
 #define	ID_AA64MMFR0_TGRAN64_MASK	(0xf << ID_AA64MMFR0_TGRAN64_SHIFT)
 #define	ID_AA64MMFR0_TGRAN64(x)		((x) & ID_AA64MMFR0_TGRAN64_MASK)
 #define	 ID_AA64MMFR0_TGRAN64_IMPL	(0x0 << ID_AA64MMFR0_TGRAN64_SHIFT)
 #define	 ID_AA64MMFR0_TGRAN64_NONE	(0xf << ID_AA64MMFR0_TGRAN64_SHIFT)
 /*
  * TGRAN4, bits [31:28].  The constants are unsigned long: as ints, the
  * mask and the 0xf "NONE" value overflow and sign-extend, so a 64-bit
  * register value could never compare equal to TGRAN4_NONE.
  */
 #define	ID_AA64MMFR0_TGRAN4_SHIFT	28
 #define	ID_AA64MMFR0_TGRAN4_MASK	(0xful << ID_AA64MMFR0_TGRAN4_SHIFT)
 #define	ID_AA64MMFR0_TGRAN4(x)		((x) & ID_AA64MMFR0_TGRAN4_MASK)
 #define	 ID_AA64MMFR0_TGRAN4_IMPL	(0x0ul << ID_AA64MMFR0_TGRAN4_SHIFT)
 #define	 ID_AA64MMFR0_TGRAN4_NONE	(0xful << ID_AA64MMFR0_TGRAN4_SHIFT)
 
 /* ID_AA64MMFR1_EL1 */
 #define	ID_AA64MMFR1_MASK		0xffffffff
 #define	ID_AA64MMFR1_HAFDBS_SHIFT	0
 #define	ID_AA64MMFR1_HAFDBS_MASK	(0xf << ID_AA64MMFR1_HAFDBS_SHIFT)
 #define	ID_AA64MMFR1_HAFDBS(x)		((x) & ID_AA64MMFR1_HAFDBS_MASK)
 #define	 ID_AA64MMFR1_HAFDBS_NONE	(0x0 << ID_AA64MMFR1_HAFDBS_SHIFT)
 #define	 ID_AA64MMFR1_HAFDBS_AF		(0x1 << ID_AA64MMFR1_HAFDBS_SHIFT)
 #define	 ID_AA64MMFR1_HAFDBS_AF_DBS	(0x2 << ID_AA64MMFR1_HAFDBS_SHIFT)
 #define	ID_AA64MMFR1_VMIDBITS_SHIFT	4
 #define	ID_AA64MMFR1_VMIDBITS_MASK	(0xf << ID_AA64MMFR1_VMIDBITS_SHIFT)
 #define	ID_AA64MMFR1_VMIDBITS(x)	((x) & ID_AA64MMFR1_VMIDBITS_MASK)
 #define	 ID_AA64MMFR1_VMIDBITS_8	(0x0 << ID_AA64MMFR1_VMIDBITS_SHIFT)
 #define	 ID_AA64MMFR1_VMIDBITS_16	(0x2 << ID_AA64MMFR1_VMIDBITS_SHIFT)
 #define	ID_AA64MMFR1_VH_SHIFT		8
 #define	ID_AA64MMFR1_VH_MASK		(0xf << ID_AA64MMFR1_VH_SHIFT)
 #define	ID_AA64MMFR1_VH(x)		((x) & ID_AA64MMFR1_VH_MASK)
 #define	 ID_AA64MMFR1_VH_NONE		(0x0 << ID_AA64MMFR1_VH_SHIFT)
 #define	 ID_AA64MMFR1_VH_IMPL		(0x1 << ID_AA64MMFR1_VH_SHIFT)
 #define	ID_AA64MMFR1_HPDS_SHIFT		12
 #define	ID_AA64MMFR1_HPDS_MASK		(0xf << ID_AA64MMFR1_HPDS_SHIFT)
 #define	ID_AA64MMFR1_HPDS(x)		((x) & ID_AA64MMFR1_HPDS_MASK)
 #define	 ID_AA64MMFR1_HPDS_NONE		(0x0 << ID_AA64MMFR1_HPDS_SHIFT)
 #define	 ID_AA64MMFR1_HPDS_HPD		(0x1 << ID_AA64MMFR1_HPDS_SHIFT)
 #define	 ID_AA64MMFR1_HPDS_TTPBHA	(0x2 << ID_AA64MMFR1_HPDS_SHIFT)
 #define	ID_AA64MMFR1_LO_SHIFT		16
 #define	ID_AA64MMFR1_LO_MASK		(0xf << ID_AA64MMFR1_LO_SHIFT)
 #define	ID_AA64MMFR1_LO(x)		((x) & ID_AA64MMFR1_LO_MASK)
 #define	 ID_AA64MMFR1_LO_NONE		(0x0 << ID_AA64MMFR1_LO_SHIFT)
 #define	 ID_AA64MMFR1_LO_IMPL		(0x1 << ID_AA64MMFR1_LO_SHIFT)
 #define	ID_AA64MMFR1_PAN_SHIFT		20
 #define	ID_AA64MMFR1_PAN_MASK		(0xf << ID_AA64MMFR1_PAN_SHIFT)
 #define	ID_AA64MMFR1_PAN(x)		((x) & ID_AA64MMFR1_PAN_MASK)
 #define	 ID_AA64MMFR1_PAN_NONE		(0x0 << ID_AA64MMFR1_PAN_SHIFT)
 #define	 ID_AA64MMFR1_PAN_IMPL		(0x1 << ID_AA64MMFR1_PAN_SHIFT)
 #define	 ID_AA64MMFR1_PAN_ATS1E1	(0x2 << ID_AA64MMFR1_PAN_SHIFT)
 #define	ID_AA64MMFR1_SPEC_SEI_SHIFT	24
 #define	ID_AA64MMFR1_SPEC_SEI_MASK	(0xf << ID_AA64MMFR1_SPEC_SEI_SHIFT)
 #define	ID_AA64MMFR1_SPEC_SEI(x)	((x) & ID_AA64MMFR1_SPEC_SEI_MASK)
 #define	 ID_AA64MMFR1_SPEC_SEI_NONE	(0x0 << ID_AA64MMFR1_SPEC_SEI_SHIFT)
 #define	 ID_AA64MMFR1_SPEC_SEI_IMPL	(0x1 << ID_AA64MMFR1_SPEC_SEI_SHIFT)
 /*
  * XNX, bits [31:28].  The constants are unsigned long so the mask is not
  * sign-extended into bits [63:32] when applied to the 64-bit register.
  */
 #define	ID_AA64MMFR1_XNX_SHIFT		28
 #define	ID_AA64MMFR1_XNX_MASK		(0xful << ID_AA64MMFR1_XNX_SHIFT)
 #define	ID_AA64MMFR1_XNX(x)		((x) & ID_AA64MMFR1_XNX_MASK)
 #define	 ID_AA64MMFR1_XNX_NONE		(0x0ul << ID_AA64MMFR1_XNX_SHIFT)
 #define	 ID_AA64MMFR1_XNX_IMPL		(0x1ul << ID_AA64MMFR1_XNX_SHIFT)
 
 /* ID_AA64MMFR2_EL1 */
 #define	ID_AA64MMFR2_EL1		S3_0_C0_C7_2
 #define	ID_AA64MMFR2_MASK		0x0fffffff
 #define	ID_AA64MMFR2_CNP_SHIFT		0
 #define	ID_AA64MMFR2_CNP_MASK		(0xf << ID_AA64MMFR2_CNP_SHIFT)
 #define	ID_AA64MMFR2_CNP(x)		((x) & ID_AA64MMFR2_CNP_MASK)
 #define	 ID_AA64MMFR2_CNP_NONE		(0x0 << ID_AA64MMFR2_CNP_SHIFT)
 #define	 ID_AA64MMFR2_CNP_IMPL		(0x1 << ID_AA64MMFR2_CNP_SHIFT)
 #define	ID_AA64MMFR2_UAO_SHIFT		4
 #define	ID_AA64MMFR2_UAO_MASK		(0xf << ID_AA64MMFR2_UAO_SHIFT)
 #define	ID_AA64MMFR2_UAO(x)		((x) & ID_AA64MMFR2_UAO_MASK)
 #define	 ID_AA64MMFR2_UAO_NONE		(0x0 << ID_AA64MMFR2_UAO_SHIFT)
 #define	 ID_AA64MMFR2_UAO_IMPL		(0x1 << ID_AA64MMFR2_UAO_SHIFT)
 #define	ID_AA64MMFR2_LSM_SHIFT		8
 #define	ID_AA64MMFR2_LSM_MASK		(0xf << ID_AA64MMFR2_LSM_SHIFT)
 #define	ID_AA64MMFR2_LSM(x)		((x) & ID_AA64MMFR2_LSM_MASK)
 #define	 ID_AA64MMFR2_LSM_NONE		(0x0 << ID_AA64MMFR2_LSM_SHIFT)
 #define	 ID_AA64MMFR2_LSM_IMPL		(0x1 << ID_AA64MMFR2_LSM_SHIFT)
 #define	ID_AA64MMFR2_IESB_SHIFT		12
 #define	ID_AA64MMFR2_IESB_MASK		(0xf << ID_AA64MMFR2_IESB_SHIFT)
 #define	ID_AA64MMFR2_IESB(x)		((x) & ID_AA64MMFR2_IESB_MASK)
 #define	 ID_AA64MMFR2_IESB_NONE		(0x0 << ID_AA64MMFR2_IESB_SHIFT)
 #define	 ID_AA64MMFR2_IESB_IMPL		(0x1 << ID_AA64MMFR2_IESB_SHIFT)
 #define	ID_AA64MMFR2_VA_RANGE_SHIFT	16
 #define	ID_AA64MMFR2_VA_RANGE_MASK	(0xf << ID_AA64MMFR2_VA_RANGE_SHIFT)
 #define	ID_AA64MMFR2_VA_RANGE(x)	((x) & ID_AA64MMFR2_VA_RANGE_MASK)
 #define	 ID_AA64MMFR2_VA_RANGE_48	(0x0 << ID_AA64MMFR2_VA_RANGE_SHIFT)
 #define	 ID_AA64MMFR2_VA_RANGE_52	(0x1 << ID_AA64MMFR2_VA_RANGE_SHIFT)
 #define	ID_AA64MMFR2_CCIDX_SHIFT	20
 #define	ID_AA64MMFR2_CCIDX_MASK		(0xf << ID_AA64MMFR2_CCIDX_SHIFT)
 #define	ID_AA64MMFR2_CCIDX(x)		((x) & ID_AA64MMFR2_CCIDX_MASK)
 #define	 ID_AA64MMFR2_CCIDX_32		(0x0 << ID_AA64MMFR2_CCIDX_SHIFT)
 #define	 ID_AA64MMFR2_CCIDX_64		(0x1 << ID_AA64MMFR2_CCIDX_SHIFT)
 #define	ID_AA64MMFR2_NV_SHIFT		24
 #define	ID_AA64MMFR2_NV_MASK		(0xf << ID_AA64MMFR2_NV_SHIFT)
 #define	ID_AA64MMFR2_NV(x)		((x) & ID_AA64MMFR2_NV_MASK)
 #define	 ID_AA64MMFR2_NV_NONE		(0x0 << ID_AA64MMFR2_NV_SHIFT)
 #define	 ID_AA64MMFR2_NV_IMPL		(0x1 << ID_AA64MMFR2_NV_SHIFT)
 
 /* ID_AA64PFR0_EL1 */
 #define	ID_AA64PFR0_MASK		0x0000000ffffffffful
 #define	ID_AA64PFR0_EL0_SHIFT		0
 #define	ID_AA64PFR0_EL0_MASK		(0xf << ID_AA64PFR0_EL0_SHIFT)
 #define	ID_AA64PFR0_EL0(x)		((x) & ID_AA64PFR0_EL0_MASK)
 #define	 ID_AA64PFR0_EL0_64		(1 << ID_AA64PFR0_EL0_SHIFT)
 #define	 ID_AA64PFR0_EL0_64_32		(2 << ID_AA64PFR0_EL0_SHIFT)
 #define	ID_AA64PFR0_EL1_SHIFT		4
 #define	ID_AA64PFR0_EL1_MASK		(0xf << ID_AA64PFR0_EL1_SHIFT)
 #define	ID_AA64PFR0_EL1(x)		((x) & ID_AA64PFR0_EL1_MASK)
 #define	 ID_AA64PFR0_EL1_64		(1 << ID_AA64PFR0_EL1_SHIFT)
 #define	 ID_AA64PFR0_EL1_64_32		(2 << ID_AA64PFR0_EL1_SHIFT)
 #define	ID_AA64PFR0_EL2_SHIFT		8
 #define	ID_AA64PFR0_EL2_MASK		(0xf << ID_AA64PFR0_EL2_SHIFT)
 #define	ID_AA64PFR0_EL2(x)		((x) & ID_AA64PFR0_EL2_MASK)
 #define	 ID_AA64PFR0_EL2_NONE		(0 << ID_AA64PFR0_EL2_SHIFT)
 #define	 ID_AA64PFR0_EL2_64		(1 << ID_AA64PFR0_EL2_SHIFT)
 #define	 ID_AA64PFR0_EL2_64_32		(2 << ID_AA64PFR0_EL2_SHIFT)
 #define	ID_AA64PFR0_EL3_SHIFT		12
 #define	ID_AA64PFR0_EL3_MASK		(0xf << ID_AA64PFR0_EL3_SHIFT)
 #define	ID_AA64PFR0_EL3(x)		((x) & ID_AA64PFR0_EL3_MASK)
 #define	 ID_AA64PFR0_EL3_NONE		(0 << ID_AA64PFR0_EL3_SHIFT)
 #define	 ID_AA64PFR0_EL3_64		(1 << ID_AA64PFR0_EL3_SHIFT)
 #define	 ID_AA64PFR0_EL3_64_32		(2 << ID_AA64PFR0_EL3_SHIFT)
 #define	ID_AA64PFR0_FP_SHIFT		16
 #define	ID_AA64PFR0_FP_MASK		(0xf << ID_AA64PFR0_FP_SHIFT)
 #define	ID_AA64PFR0_FP(x)		((x) & ID_AA64PFR0_FP_MASK)
 #define	 ID_AA64PFR0_FP_IMPL		(0x0 << ID_AA64PFR0_FP_SHIFT)
 #define	 ID_AA64PFR0_FP_HP		(0x1 << ID_AA64PFR0_FP_SHIFT)
 #define	 ID_AA64PFR0_FP_NONE		(0xf << ID_AA64PFR0_FP_SHIFT)
 #define	ID_AA64PFR0_ADV_SIMD_SHIFT	20
 #define	ID_AA64PFR0_ADV_SIMD_MASK	(0xf << ID_AA64PFR0_ADV_SIMD_SHIFT)
 #define	ID_AA64PFR0_ADV_SIMD(x)		((x) & ID_AA64PFR0_ADV_SIMD_MASK)
 #define	 ID_AA64PFR0_ADV_SIMD_IMPL	(0x0 << ID_AA64PFR0_ADV_SIMD_SHIFT)
 #define	 ID_AA64PFR0_ADV_SIMD_HP	(0x1 << ID_AA64PFR0_ADV_SIMD_SHIFT)
 #define	 ID_AA64PFR0_ADV_SIMD_NONE	(0xf << ID_AA64PFR0_ADV_SIMD_SHIFT)
 #define	ID_AA64PFR0_GIC_BITS		0x4 /* Number of bits in GIC field */
 #define	ID_AA64PFR0_GIC_SHIFT		24
 #define	ID_AA64PFR0_GIC_MASK		(0xf << ID_AA64PFR0_GIC_SHIFT)
 #define	ID_AA64PFR0_GIC(x)		((x) & ID_AA64PFR0_GIC_MASK)
 #define	 ID_AA64PFR0_GIC_CPUIF_NONE	(0x0 << ID_AA64PFR0_GIC_SHIFT)
 #define	 ID_AA64PFR0_GIC_CPUIF_EN	(0x1 << ID_AA64PFR0_GIC_SHIFT)
 /*
  * RAS, bits [31:28].  The constants are unsigned long so the mask is not
  * sign-extended into bits [63:32] (where the SVE field lives) when
  * applied to the 64-bit register.
  */
 #define	ID_AA64PFR0_RAS_SHIFT		28
 #define	ID_AA64PFR0_RAS_MASK		(0xful << ID_AA64PFR0_RAS_SHIFT)
 #define	ID_AA64PFR0_RAS(x)		((x) & ID_AA64PFR0_RAS_MASK)
 #define	 ID_AA64PFR0_RAS_NONE		(0x0ul << ID_AA64PFR0_RAS_SHIFT)
 #define	 ID_AA64PFR0_RAS_V1		(0x1ul << ID_AA64PFR0_RAS_SHIFT)
 #define	ID_AA64PFR0_SVE_SHIFT		32
 #define	ID_AA64PFR0_SVE_MASK		(0xful << ID_AA64PFR0_SVE_SHIFT)
 #define	ID_AA64PFR0_SVE(x)		((x) & ID_AA64PFR0_SVE_MASK)
 #define	 ID_AA64PFR0_SVE_NONE		(0x0ul << ID_AA64PFR0_SVE_SHIFT)
 #define	 ID_AA64PFR0_SVE_IMPL		(0x1ul << ID_AA64PFR0_SVE_SHIFT)
 
 /* MAIR_EL1 - Memory Attribute Indirection Register */
 /*
  * Attribute `idx' occupies the 8-bit field starting at bit idx * 8.
  * MAIR_ATTR_MASK() yields the mask of that field (as unsigned long, so
  * every index of a 64-bit register is reachable); MAIR_ATTR() places
  * `attr' into it.
  */
 #define	MAIR_ATTR_MASK(idx)	(0xffUL << ((idx) * 8))
 #define	MAIR_ATTR(attr, idx) ((attr) << ((idx) * 8))
 #define	 MAIR_DEVICE_nGnRnE	0x00
 #define	 MAIR_NORMAL_NC		0x44
 #define	 MAIR_NORMAL_WT		0xbb
 #define	 MAIR_NORMAL_WB		0xff
 
 /* PAR_EL1 - Physical Address Register */
 #define	PAR_F_SHIFT		0
 #define	PAR_F			(0x1 << PAR_F_SHIFT)
 #define	PAR_SUCCESS(x)		(((x) & PAR_F) == 0)
 /* When PAR_F == 0 (success) */
 #define	PAR_SH_SHIFT		7
 #define	PAR_SH_MASK		(0x3 << PAR_SH_SHIFT)
 #define	PAR_NS_SHIFT		9
 #define	PAR_NS_MASK		(0x3 << PAR_NS_SHIFT)
 #define	PAR_PA_SHIFT		12
 #define	PAR_PA_MASK		0x0000fffffffff000
 /* ATTR, bits [63:56]; needs a 64-bit constant to be shifted this far. */
 #define	PAR_ATTR_SHIFT		56
 #define	PAR_ATTR_MASK		(0xffUL << PAR_ATTR_SHIFT)
 /* When PAR_F == 1 (aborted) */
 #define	PAR_FST_SHIFT		1
 #define	PAR_FST_MASK		(0x3f << PAR_FST_SHIFT)
 #define	PAR_PTW_SHIFT		8
 #define	PAR_PTW_MASK		(0x1 << PAR_PTW_SHIFT)
 #define	PAR_S_SHIFT		9
 #define	PAR_S_MASK		(0x1 << PAR_S_SHIFT)
 
 /* SCTLR_EL1 - System Control Register */
 #define	SCTLR_RES0	0xc8222400	/* Reserved ARMv8.0, write 0 */
 #define	SCTLR_RES1	0x30d00800	/* Reserved ARMv8.0, write 1 */
 
 #define	SCTLR_M		0x00000001
 #define	SCTLR_A		0x00000002
 #define	SCTLR_C		0x00000004
 #define	SCTLR_SA	0x00000008
 #define	SCTLR_SA0	0x00000010
 #define	SCTLR_CP15BEN	0x00000020
 #define	SCTLR_THEE	0x00000040
 #define	SCTLR_ITD	0x00000080
 #define	SCTLR_SED	0x00000100
 #define	SCTLR_UMA	0x00000200
 #define	SCTLR_I		0x00001000
 #define	SCTLR_DZE	0x00004000
 #define	SCTLR_UCT	0x00008000
 #define	SCTLR_nTWI	0x00010000
 #define	SCTLR_nTWE	0x00040000
 #define	SCTLR_WXN	0x00080000
 #define	SCTLR_IESB	0x00200000
 #define	SCTLR_SPAN	0x00800000
 #define	SCTLR_EOE	0x01000000
 #define	SCTLR_EE	0x02000000
 #define	SCTLR_UCI	0x04000000
 #define	SCTLR_nTLSMD	0x10000000
 #define	SCTLR_LSMAOE	0x20000000
 
 /* SPSR_EL1 */
 /*
  * When the exception is taken in AArch64:
  * M[4]   is 0 for AArch64 mode
  * M[3:2] is the exception level
  * M[1]   is unused
  * M[0]   is the SP select:
  *         0: always SP0
  *         1: current EL's SP
  */
 #define	PSR_M_EL0t	0x00000000
 #define	PSR_M_EL1t	0x00000004
 #define	PSR_M_EL1h	0x00000005
 #define	PSR_M_EL2t	0x00000008
 #define	PSR_M_EL2h	0x00000009
 #define	PSR_M_MASK	0x0000001f
 
 #define	PSR_F		0x00000040
 #define	PSR_I		0x00000080
 #define	PSR_A		0x00000100
 #define	PSR_D		0x00000200
 #define	PSR_IL		0x00100000
 #define	PSR_SS		0x00200000
 #define	PSR_V		0x10000000
 #define	PSR_C		0x20000000
 #define	PSR_Z		0x40000000
 #define	PSR_N		0x80000000
+#define	PSR_FLAGS	0xf0000000
 
 /* TCR_EL1 - Translation Control Register */
 /*
  * These fields sit at or above bit 30 of a 64-bit register, so the
  * constants are spelled with the UL suffix (as the TCR_SH*, TCR_ORGN*
  * and TCR_IRGN* values in this header are): an int cannot be shifted by
  * 32 or 36 bits, and 2 << 30 / 3 << 30 overflow it.
  */
 #define	TCR_ASID_16	(1UL << 36)
 
 #define	TCR_IPS_SHIFT	32
 #define	TCR_IPS_32BIT	(0UL << TCR_IPS_SHIFT)
 #define	TCR_IPS_36BIT	(1UL << TCR_IPS_SHIFT)
 #define	TCR_IPS_40BIT	(2UL << TCR_IPS_SHIFT)
 #define	TCR_IPS_42BIT	(3UL << TCR_IPS_SHIFT)
 #define	TCR_IPS_44BIT	(4UL << TCR_IPS_SHIFT)
 #define	TCR_IPS_48BIT	(5UL << TCR_IPS_SHIFT)
 
 #define	TCR_TG1_SHIFT	30
 #define	TCR_TG1_16K	(1UL << TCR_TG1_SHIFT)
 #define	TCR_TG1_4K	(2UL << TCR_TG1_SHIFT)
 #define	TCR_TG1_64K	(3UL << TCR_TG1_SHIFT)
 
 #define	TCR_SH1_SHIFT	28
 #define	TCR_SH1_IS	(0x3UL << TCR_SH1_SHIFT)
 #define	TCR_ORGN1_SHIFT	26
 #define	TCR_ORGN1_WBWA	(0x1UL << TCR_ORGN1_SHIFT)
 #define	TCR_IRGN1_SHIFT	24
 #define	TCR_IRGN1_WBWA	(0x1UL << TCR_IRGN1_SHIFT)
 #define	TCR_SH0_SHIFT	12
 #define	TCR_SH0_IS	(0x3UL << TCR_SH0_SHIFT)
 #define	TCR_ORGN0_SHIFT	10
 #define	TCR_ORGN0_WBWA	(0x1UL << TCR_ORGN0_SHIFT)
 #define	TCR_IRGN0_SHIFT	8
 #define	TCR_IRGN0_WBWA	(0x1UL << TCR_IRGN0_SHIFT)
 
 #define	TCR_CACHE_ATTRS	((TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) |\
 				(TCR_ORGN0_WBWA | TCR_ORGN1_WBWA))
 
 #ifdef SMP
 #define	TCR_SMP_ATTRS	(TCR_SH0_IS | TCR_SH1_IS)
 #else
 #define	TCR_SMP_ATTRS	0
 #endif
 
 #define	TCR_T1SZ_SHIFT	16
 #define	TCR_T0SZ_SHIFT	0
 #define	TCR_T1SZ(x)	((x) << TCR_T1SZ_SHIFT)
 #define	TCR_T0SZ(x)	((x) << TCR_T0SZ_SHIFT)
 #define	TCR_TxSZ(x)	(TCR_T1SZ(x) | TCR_T0SZ(x))
 
 /* Saved Program Status Register */
 #define	DBG_SPSR_SS	(0x1 << 21)
 
 /* Monitor Debug System Control Register */
 #define	DBG_MDSCR_SS	(0x1 << 0)
 #define	DBG_MDSCR_KDE	(0x1 << 13)
 #define	DBG_MDSCR_MDE	(0x1 << 15)
 
 /* Performance Monitoring Counters */
 #define	PMCR_E		(1 << 0) /* Enable all counters */
 #define	PMCR_P		(1 << 1) /* Reset all counters */
 #define	PMCR_C		(1 << 2) /* Clock counter reset */
 #define	PMCR_D		(1 << 3) /* CNTR counts every 64 clk cycles */
 #define	PMCR_X		(1 << 4) /* Export to ext. monitoring (ETM) */
 #define	PMCR_DP		(1 << 5) /* Disable CCNT if non-invasive debug*/
 #define	PMCR_LC		(1 << 6) /* Long cycle count enable */
 #define	PMCR_IMP_SHIFT	24 /* Implementer code */
 #define	PMCR_IMP_MASK	(0xffU << PMCR_IMP_SHIFT) /* unsigned: reaches bit 31 */
 #define	PMCR_IDCODE_SHIFT	16 /* Identification code */
 #define	PMCR_IDCODE_MASK	(0xff << PMCR_IDCODE_SHIFT)
 #define	 PMCR_IDCODE_CORTEX_A57	0x01
 #define	 PMCR_IDCODE_CORTEX_A72	0x02
 #define	 PMCR_IDCODE_CORTEX_A53	0x03
 #define	PMCR_N_SHIFT	11       /* Number of counters implemented */
 #define	PMCR_N_MASK	(0x1f << PMCR_N_SHIFT)
 
 #endif /* !_MACHINE_ARMREG_H_ */
Index: projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
===================================================================
--- projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c	(revision 326162)
@@ -1,2697 +1,2691 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  *
  * Portions Copyright 2010 The FreeBSD Foundation
  *
  * $FreeBSD$
  */
 
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2015, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/atomic.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
 #ifdef illumos
 #include <sys/ddi.h>
 #endif
 #include <sys/sunddi.h>
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
 #ifdef illumos
 #include <sys/strsubr.h>
 #endif
 #include <sys/fasttrap.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/fasttrap_isa.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
 #include <sys/sysmacros.h>
 #include <sys/proc.h>
 #include <sys/policy.h>
 #ifdef illumos
 #include <util/qsort.h>
 #endif
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #ifndef illumos
 #include <sys/dtrace_bsd.h>
 #include <sys/eventhandler.h>
 #include <sys/rmlock.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/u8_textprep.h>
 #include <sys/user.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 
 #include <cddl/dev/dtrace/dtrace_cddl.h>
 #endif
 
 /*
  * User-Land Trap-Based Tracing
  * ----------------------------
  *
  * The fasttrap provider allows DTrace consumers to instrument any user-level
  * instruction to gather data; this includes probes with semantic
  * significance like entry and return as well as simple offsets into the
  * function. While the specific techniques used are very ISA specific, the
  * methodology is generalizable to any architecture.
  *
  *
  * The General Methodology
  * -----------------------
  *
  * With the primary goal of tracing every user-land instruction and the
  * limitation that we can't trust user space so don't want to rely on much
  * information there, we begin by replacing the instructions we want to trace
  * with trap instructions. Each instruction we overwrite is saved into a hash
  * table keyed by process ID and pc address. When we enter the kernel due to
  * this trap instruction, we need the effects of the replaced instruction to
  * appear to have occurred before we proceed with the user thread's
  * execution.
  *
  * Each user level thread is represented by a ulwp_t structure which is
  * always easily accessible through a register. The most basic way to produce
  * the effects of the instruction we replaced is to copy that instruction out
  * to a bit of scratch space reserved in the user thread's ulwp_t structure
  * (a sort of kernel-private thread local storage), set the PC to that
  * scratch space and single step. When we reenter the kernel after single
  * stepping the instruction we must then adjust the PC to point to what would
  * normally be the next instruction. Of course, special care must be taken
  * for branches and jumps, but these represent such a small fraction of any
  * instruction set that writing the code to emulate these in the kernel is
  * not too difficult.
  *
  * Return probes may require several tracepoints to trace every return site,
  * and, conversely, each tracepoint may activate several probes (the entry
  * and offset 0 probes, for example). To solve this multiplexing problem,
  * tracepoints contain lists of probes to activate and probes contain lists
  * of tracepoints to enable. If a probe is activated, it adds its ID to
  * existing tracepoints or creates new ones as necessary.
  *
  * Most probes are activated _before_ the instruction is executed, but return
  * probes are activated _after_ the effects of the last instruction of the
  * function are visible. Return probes must be fired _after_ we have
  * single-stepped the instruction whereas all other probes are fired
  * beforehand.
  *
  *
  * Lock Ordering
  * -------------
  *
  * The lock ordering below -- both internally and with respect to the DTrace
  * framework -- is a little tricky and bears some explanation. Each provider
  * has a lock (ftp_mtx) that protects its members including reference counts
  * for enabled probes (ftp_rcount), consumers actively creating probes
  * (ftp_ccount) and USDT consumers (ftp_mcount); all three prevent a provider
  * from being freed. A provider is looked up by taking the bucket lock for the
  * provider hash table, and is returned with its lock held. The provider lock
  * may be taken in functions invoked by the DTrace framework, but may not be
  * held while calling functions in the DTrace framework.
  *
  * To ensure consistency over multiple calls to the DTrace framework, the
  * creation lock (ftp_cmtx) should be held. Naturally, the creation lock may
  * not be taken when holding the provider lock as that would create a cyclic
  * lock ordering. In situations where one would naturally take the provider
  * lock and then the creation lock, we instead up a reference count to prevent
  * the provider from disappearing, drop the provider lock, and acquire the
  * creation lock.
  *
  * Briefly:
  * 	bucket lock before provider lock
  *	DTrace before provider lock
  *	creation lock before DTrace
  *	never hold the provider lock and creation lock simultaneously
  */
 
 /*
  * Character-device entry points referenced by fasttrap_cdevsw below; they
  * are only declared here.
  */
 static d_open_t fasttrap_open;
 static d_ioctl_t fasttrap_ioctl;
 
 /* Device switch for the "fasttrap" device: only open and ioctl are supplied. */
 static struct cdevsw fasttrap_cdevsw = {
 	.d_version	= D_VERSION,
 	.d_open		= fasttrap_open,
 	.d_ioctl	= fasttrap_ioctl,
 	.d_name		= "fasttrap",
 };
 static struct cdev *fasttrap_cdev;
 static dtrace_meta_provider_id_t fasttrap_meta_id;
 
 /*
  * State shared with the asynchronous cleanup thread
  * (fasttrap_pid_cleanup_cb()).  fasttrap_cleanup_mtx is held around reads
  * and writes of fasttrap_cleanup_work and fasttrap_cleanup_drain:
  *
  *	fasttrap_cleanup_work	set to 1 by fasttrap_pid_cleanup() to request
  *				a pass; cleared by the thread when it starts
  *				one.
  *	fasttrap_cleanup_drain	once non-zero, the thread leaves its loop as
  *				soon as no provider is left to retry; its
  *				address is also the channel woken on exit.
  *	fasttrap_cleanup_cv	only its address is used, as the sleep/wakeup
  *				channel between the requester and the thread.
  *
  * NOTE(review): fasttrap_cleanup_proc presumably refers to the process
  * running the cleanup thread; its use is not visible in this part of the
  * file.
  */
 static struct proc *fasttrap_cleanup_proc;
 static struct mtx fasttrap_cleanup_mtx;
 static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv;
 
 /*
  * Generation count on modifications to the global tracepoint lookup table.
  */
 static volatile uint64_t fasttrap_mod_gen;
 
 /*
  * When the fasttrap provider is loaded, fasttrap_max is set to either
  * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the
  * fasttrap.conf file (Illumos), or the value provided in the loader.conf (FreeBSD).
  * Each time a probe is created, fasttrap_total is incremented by the number
  * of tracepoints that may be associated with that probe; fasttrap_total is capped
  * at fasttrap_max.
  */
 #define	FASTTRAP_MAX_DEFAULT		250000
 static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT;
 static uint32_t fasttrap_total;
 
 /*
  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  */
 
 /*
  * Default sizes. FASTTRAP_TPOINTS_DEFAULT_SIZE seeds tpoints_hash_size
  * below; the other two are presumably the defaults for the provider and
  * process hash tables (their use is not visible here).
  */
 #define	FASTTRAP_TPOINTS_DEFAULT_SIZE	0x4000
 #define	FASTTRAP_PROVIDERS_DEFAULT_SIZE	0x100
 #define	FASTTRAP_PROCS_DEFAULT_SIZE	0x100
 
 /* Provider name passed to fasttrap_provider_retire() on exec/exit. */
 #define	FASTTRAP_PID_NAME		"pid"
 
 /*
  * Hash tables: tracepoints (indexed with FASTTRAP_TPOINTS_INDEX(pid, pc)),
  * providers (FASTTRAP_PROVS_INDEX) and per-process handles
  * (FASTTRAP_PROCS_INDEX). Each bucket carries its own ftb_mtx.
  */
 fasttrap_hash_t			fasttrap_tpoints;
 static fasttrap_hash_t		fasttrap_provs;
 static fasttrap_hash_t		fasttrap_procs;
 
 /*
  * Count maintained by fasttrap_enable_callbacks() and
  * fasttrap_disable_callbacks(); the DTrace pid/return probe hooks are
  * installed when it leaves zero and cleared when it returns to zero.
  */
 static uint64_t			fasttrap_pid_count;	/* pid ref count */
 static kmutex_t			fasttrap_count_mtx;	/* lock on ref count */
 
 /*
  * Non-zero results of fasttrap_tracepoint_enable(): FAIL when the
  * ISA-specific initialization of the tracepoint fails, PARTIAL when the
  * tracepoint was linked into the hash table but could not be installed in
  * the process (the caller must still disable it).
  */
 #define	FASTTRAP_ENABLE_FAIL	1
 #define	FASTTRAP_ENABLE_PARTIAL	2
 
 static int fasttrap_tracepoint_enable(proc_t *, fasttrap_probe_t *, uint_t);
 static void fasttrap_tracepoint_disable(proc_t *, fasttrap_probe_t *, uint_t);
 
 static fasttrap_provider_t *fasttrap_provider_lookup(pid_t, const char *,
     const dtrace_pattr_t *);
 static void fasttrap_provider_retire(pid_t, const char *, int);
 static void fasttrap_provider_free(fasttrap_provider_t *);
 
 static fasttrap_proc_t *fasttrap_proc_lookup(pid_t);
 static void fasttrap_proc_release(fasttrap_proc_t *);
 
 #ifndef illumos
 static void fasttrap_thread_dtor(void *, struct thread *);
 #endif
 
 /* Bucket index in fasttrap_provs for a (pid, provider name) pair. */
 #define	FASTTRAP_PROVS_INDEX(pid, name) \
 	((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask)
 
 /* Bucket index in fasttrap_procs for a pid. */
 #define	FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask)
 
 #ifndef illumos
 /*
  * fasttrap_tp_lock is taken and released as a writer by
  * fasttrap_mod_barrier(). fasttrap_thread_dtor_tag is presumably the
  * registration handle for fasttrap_thread_dtor() -- confirm at the
  * registration site.
  */
 struct rmlock fasttrap_tp_lock;
 static eventhandler_tag fasttrap_thread_dtor_tag;
 #endif
 
 /* Backing variable for the tpoints_hash_size tunable declared below. */
 static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE;
 
 #ifdef __FreeBSD__
 /*
  * kern.dtrace.fasttrap.max_probes exposes fasttrap_max (CTLFLAG_RWTUN) and
  * kern.dtrace.fasttrap.tpoints_hash_size exposes tpoints_hash_size
  * (CTLFLAG_RDTUN).
  */
 SYSCTL_DECL(_kern_dtrace);
 SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters");
 SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max,
     FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes");
 SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size,
     FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table");
 #endif
 
 /*
  * Return the 1-based position of the most significant set bit of i
  * (so 1 -> 1, 0x80 -> 8), or 0 when i is zero.
  */
 static int
 fasttrap_highbit(ulong_t i)
 {
 	static const int halves[] = { 16, 8, 4, 2, 1 };
 	int pos, k;
 
 	if (i == 0)
 		return (0);
 
 	pos = 1;
 #ifdef _LP64
 	/* Only a 64-bit ulong_t can have bits above position 32. */
 	if ((i >> 32) != 0) {
 		pos += 32;
 		i >>= 32;
 	}
 #endif
 	/*
 	 * Binary search: whenever the upper half of the remaining window is
 	 * populated, account for the lower half and descend into the upper.
 	 */
 	for (k = 0; k < (int)(sizeof (halves) / sizeof (halves[0])); k++) {
 		if ((i >> halves[k]) != 0) {
 			pos += halves[k];
 			i >>= halves[k];
 		}
 	}
 	return (pos);
 }
 
 /*
  * Hash a NUL-terminated string: shift the running value left by four bits,
  * add the next character, then fold the top nibble back into bits 4-7 and
  * clear it. The empty string hashes to 0.
  */
 static uint_t
 fasttrap_hash_str(const char *p)
 {
 	uint_t hash = 0;
 	uint_t top;
 
 	for (; *p != '\0'; p++) {
 		hash = (hash << 4) + *p;
 		top = hash & 0xf0000000;
 		if (top != 0)
 			hash ^= top >> 24;
 		hash &= ~top;
 	}
 	return (hash);
 }
 
 /*
  * Post a SIGTRAP with si_code TRAP_DTRACE and si_addr pc to process p.
  *
  * p:  target process; its process lock is taken and dropped here.
  * t:  target thread, or NULL (fasttrap_tracepoint_disable() passes NULL).
  * pc: address reported in the signal information.
  */
 void
 fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc)
 {
 #ifdef illumos
 	sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 
 	sqp->sq_info.si_signo = SIGTRAP;
 	sqp->sq_info.si_code = TRAP_DTRACE;
 	sqp->sq_info.si_addr = (caddr_t)pc;
 
 	mutex_enter(&p->p_lock);
 	sigaddqa(p, t, sqp);
 	mutex_exit(&p->p_lock);
 
 	if (t != NULL)
 		aston(t);
 #else
 	/*
 	 * The signal queue keeps its own copy of the ksiginfo, so a
 	 * heap-allocated one would never be freed. Build it on the stack
 	 * instead.
 	 */
 	ksiginfo_t ksi;
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = SIGTRAP;
 	ksi.ksi_code = TRAP_DTRACE;
 	ksi.ksi_addr = (caddr_t)pc;
 	PROC_LOCK(p);
 	(void) tdsendsignal(p, t, SIGTRAP, &ksi);
 	PROC_UNLOCK(p);
 #endif
 }
 
 #ifndef illumos
 /*
  * Return the scratch-space chunk reserved for thread td in the traced
  * process, reserving one first if the thread has none yet. Returns NULL
  * when a new scratch block is needed but cannot be mapped.
  */
 fasttrap_scrspace_t *
 fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc)
 {
 	fasttrap_scrblock_t *blk;
 	fasttrap_scrspace_t *chunk;
 	struct proc *proc;
 	vm_offset_t base;
 	int rv, n;
 
 	/* A thread keeps the chunk it was first given. */
 	if (td->t_dtrace_sscr != NULL)
 		return ((fasttrap_scrspace_t *)td->t_dtrace_sscr);
 
 	proc = td->td_proc;
 
 	mutex_enter(&fprc->ftpc_mtx);
 	if (LIST_EMPTY(&fprc->ftpc_fscr)) {
 		/*
 		 * The free list is empty: map a fresh scratch block into the
 		 * traced process' address space.
 		 */
 		base = 0;
 		rv = vm_map_find(&proc->p_vmspace->vm_map, NULL, 0, &base,
 		    FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL,
 		    VM_PROT_ALL, 0);
 		if (rv != KERN_SUCCESS) {
 			mutex_exit(&fprc->ftpc_mtx);
 			return (NULL);
 		}
 
 		blk = malloc(sizeof(*blk), M_SOLARIS, M_WAITOK);
 		blk->ftsb_addr = base;
 		LIST_INSERT_HEAD(&fprc->ftpc_scrblks, blk, ftsb_next);
 
 		/*
 		 * Slice the block into chunks and queue each one on the free
 		 * list.
 		 */
 		n = 0;
 		while (n < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE) {
 			chunk = malloc(sizeof(*chunk), M_SOLARIS, M_WAITOK);
 			chunk->ftss_addr = base +
 			    n * FASTTRAP_SCRSPACE_SIZE;
 			LIST_INSERT_HEAD(&fprc->ftpc_fscr, chunk,
 			    ftss_next);
 			n++;
 		}
 	}
 
 	/*
 	 * Move the head of the free list onto the allocated list.
 	 */
 	chunk = LIST_FIRST(&fprc->ftpc_fscr);
 	LIST_REMOVE(chunk, ftss_next);
 	LIST_INSERT_HEAD(&fprc->ftpc_ascr, chunk, ftss_next);
 
 	/*
 	 * The chunk stays reserved for td until the thread exits.
 	 */
 	td->t_dtrace_sscr = chunk;
 
 	mutex_exit(&fprc->ftpc_mtx);
 
 	return (chunk);
 }
 
 /*
  * Give the scratch-space chunk held by td (if any) back to the free list of
  * the owning process' fasttrap handle. Nothing is done when the thread has
  * no chunk or when no handle exists for its pid.
  */
 static void
 fasttrap_thread_dtor(void *arg __unused, struct thread *td)
 {
 	fasttrap_bucket_t *bkt;
 	fasttrap_proc_t *handle;
 	fasttrap_scrspace_t *chunk;
 	pid_t pid;
 
 	if (td->t_dtrace_sscr == NULL)
 		return;
 
 	pid = td->td_proc->p_pid;
 	bkt = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 
 	/* Walk the bucket chain for the handle that matches this pid. */
 	mutex_enter(&bkt->ftb_mtx);
 	handle = bkt->ftb_data;
 	while (handle != NULL && handle->ftpc_pid != pid)
 		handle = handle->ftpc_next;
 
 	if (handle == NULL) {
 		mutex_exit(&bkt->ftb_mtx);
 		return;
 	}
 
 	/* Take the handle's lock before letting go of the bucket. */
 	mutex_enter(&handle->ftpc_mtx);
 	mutex_exit(&bkt->ftb_mtx);
 
 	chunk = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
 	LIST_REMOVE(chunk, ftss_next);
 	LIST_INSERT_HEAD(&handle->ftpc_fscr, chunk, ftss_next);
 
 	mutex_exit(&handle->ftpc_mtx);
 }
 #endif
 
 /*
  * Make sure no thread is still using memory that belonged to probes which
  * were live in generation gen. If the global generation has already moved
  * past gen there is nothing to do; otherwise bump it and pass through the
  * tracepoint lock(s) once as a writer.
  */
 static void
 fasttrap_mod_barrier(uint64_t gen)
 {
 #ifdef illumos
 	int i;
 #endif
 
 	if (gen >= fasttrap_mod_gen) {
 		fasttrap_mod_gen++;
 
 #ifdef illumos
 		CPU_FOREACH(i) {
 			mutex_enter(&fasttrap_cpuc_pid_lock[i]);
 			mutex_exit(&fasttrap_cpuc_pid_lock[i]);
 		}
 #else
 		rm_wlock(&fasttrap_tp_lock);
 		rm_wunlock(&fasttrap_tp_lock);
 #endif
 	}
 }
 
 /*
  * This function performs asynchronous cleanup of fasttrap providers. The
  * Solaris implementation of this mechanism uses a timeout that's activated in
  * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while
  * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler.
  * Thus we use a dedicated process to perform the cleanup when requested.
  *
  * The body is a loop that runs until fasttrap_cleanup_drain is set and no
  * provider is left to retry. Each pass walks every bucket of fasttrap_provs
  * and tries to unregister the providers whose ftp_marked flag is set. The
  * data argument is unused. The function does not return to its caller: it
  * ends in kthread_exit().
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_cleanup_cb(void *data)
 {
 	fasttrap_provider_t **fpp, *fp;
 	fasttrap_bucket_t *bucket;
 	dtrace_provider_id_t provid;
 	int i, later = 0, rval;
 
 	mtx_lock(&fasttrap_cleanup_mtx);
 	/* Keep going while not draining, or while something must be retried. */
 	while (!fasttrap_cleanup_drain || later > 0) {
 		/* Consume the pending request before scanning, unlocked. */
 		fasttrap_cleanup_work = 0;
 		mtx_unlock(&fasttrap_cleanup_mtx);
 
 		later = 0;
 
 		/*
 		 * Iterate over all the providers trying to remove the marked
 		 * ones. If a provider is marked but not retired, we just
 		 * have to take a crack at removing it -- it's no big deal if
 		 * we can't.
 		 */
 		for (i = 0; i < fasttrap_provs.fth_nent; i++) {
 			bucket = &fasttrap_provs.fth_table[i];
 			mutex_enter(&bucket->ftb_mtx);
 			/* fpp always points at the link that leads to fp. */
 			fpp = (fasttrap_provider_t **)&bucket->ftb_data;
 
 			while ((fp = *fpp) != NULL) {
 				if (!fp->ftp_marked) {
 					fpp = &fp->ftp_next;
 					continue;
 				}
 
 				mutex_enter(&fp->ftp_mtx);
 
 				/*
 				 * If this provider has consumers actively
 				 * creating probes (ftp_ccount) or is a USDT
 				 * provider (ftp_mcount), we can't unregister
 				 * or even condense.
 				 */
 				if (fp->ftp_ccount != 0 ||
 				    fp->ftp_mcount != 0) {
 					mutex_exit(&fp->ftp_mtx);
 					/*
 					 * fpp is not advanced here: with the
 					 * mark cleared, the next iteration
 					 * takes the !ftp_marked branch above
 					 * and steps past this provider.
 					 */
 					fp->ftp_marked = 0;
 					continue;
 				}
 
 				if (!fp->ftp_retired || fp->ftp_rcount != 0)
 					fp->ftp_marked = 0;
 
 				mutex_exit(&fp->ftp_mtx);
 
 				/*
 				 * If we successfully unregister this
 				 * provider we can remove it from the hash
 				 * chain and free the memory. If our attempt
 				 * to unregister fails and this is a retired
 				 * provider, increment our flag to try again
 				 * pretty soon. If we've consumed more than
 				 * half of our total permitted number of
 				 * probes call dtrace_condense() to try to
 				 * clean out the unenabled probes.
 				 */
 				provid = fp->ftp_provid;
 				if ((rval = dtrace_unregister(provid)) != 0) {
 					if (fasttrap_total > fasttrap_max / 2)
 						(void) dtrace_condense(provid);
 
 					if (rval == EAGAIN)
 						fp->ftp_marked = 1;
 
 					/* Count providers that stay marked. */
 					later += fp->ftp_marked;
 					fpp = &fp->ftp_next;
 				} else {
 					/* Unlink fp; fpp now leads to its successor. */
 					*fpp = fp->ftp_next;
 					fasttrap_provider_free(fp);
 				}
 			}
 			mutex_exit(&bucket->ftb_mtx);
 		}
 		mtx_lock(&fasttrap_cleanup_mtx);
 
 		/*
 		 * If we were unable to retire a provider, try again after a
 		 * second. This situation can occur in certain circumstances
 		 * where providers cannot be unregistered even though they have
 		 * no probes enabled because of an execution of dtrace -l or
 		 * something similar.
 		 */
 		if (later > 0 || fasttrap_cleanup_work ||
 		    fasttrap_cleanup_drain) {
 			mtx_unlock(&fasttrap_cleanup_mtx);
 			pause("ftclean", hz);
 			mtx_lock(&fasttrap_cleanup_mtx);
 		} else
 			/* Idle until fasttrap_pid_cleanup() wakes this channel. */
 			mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx,
 			    0, "ftcl", 0);
 	}
 
 	/*
 	 * Wake up the thread in fasttrap_unload() now that we're done.
 	 */
 	wakeup(&fasttrap_cleanup_drain);
 	mtx_unlock(&fasttrap_cleanup_mtx);
 
 	kthread_exit();
 }
 
 /*
  * Request an asynchronous cleanup pass: under fasttrap_cleanup_mtx, flag
  * that work is pending and wake the cleanup thread, unless a request is
  * already outstanding.
  */
 static void
 fasttrap_pid_cleanup(void)
 {
 
 	mtx_lock(&fasttrap_cleanup_mtx);
 	if (fasttrap_cleanup_work == 0) {
 		wakeup(&fasttrap_cleanup_cv);
 		fasttrap_cleanup_work = 1;
 	}
 	mtx_unlock(&fasttrap_cleanup_mtx);
 }
 
 /*
  * This is called from cfork() via dtrace_fasttrap_fork(). The child
  * process's address space is (roughly) a copy of the parent process's so
  * we have to remove all the instrumentation we had previously enabled in the
  * parent.
  *
  * p is the parent and cp the child. On FreeBSD both process locks are
  * asserted/expected to be held on entry; they are dropped while the
  * tracepoints are removed and re-acquired before returning.
  */
 static void
 fasttrap_fork(proc_t *p, proc_t *cp)
 {
 #ifndef illumos
 	fasttrap_scrblock_t *scrblk;
 	fasttrap_proc_t *fprc = NULL;
 #endif
 	pid_t ppid = p->p_pid;
 	int i;
 
-#ifdef illumos
 	ASSERT(curproc == p);
+#ifdef illumos
 	ASSERT(p->p_proc_flag & P_PR_LOCK);
 #else
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 #endif
 #ifdef illumos
 	ASSERT(p->p_dtrace_count > 0);
 #else
-	if (p->p_dtrace_helpers) {
-		/*
-		 * dtrace_helpers_duplicate() allocates memory.
-		 */
-		_PHOLD(cp);
-		PROC_UNLOCK(p);
-		PROC_UNLOCK(cp);
-		dtrace_helpers_duplicate(p, cp);
-		PROC_LOCK(cp);
-		PROC_LOCK(p);
-		_PRELE(cp);
-	}
 	/*
 	 * This check is purposely here instead of in kern_fork.c because,
 	 * for legal reasons, we cannot include the dtrace_cddl.h header
 	 * inside kern_fork.c and insert if-clause there.
 	 */
-	if (p->p_dtrace_count == 0)
+	if (p->p_dtrace_count == 0 && p->p_dtrace_helpers == NULL)
 		return;
 #endif
+
 	ASSERT(cp->p_dtrace_count == 0);
 
 	/*
 	 * This would be simpler and faster if we maintained per-process
 	 * hash tables of enabled tracepoints. It could, however, potentially
 	 * slow down execution of a tracepoint since we'd need to go
 	 * through two levels of indirection. In the future, we should
 	 * consider either maintaining per-process ancillary lists of
 	 * enabled tracepoints or hanging a pointer to a per-process hash
 	 * table of enabled tracepoints off the proc structure.
 	 */
 
 	/*
 	 * We don't have to worry about the child process disappearing
 	 * because we're in fork().
 	 */
 #ifdef illumos
 	mtx_lock_spin(&cp->p_slock);
 	sprlock_proc(cp);
 	mtx_unlock_spin(&cp->p_slock);
 #else
 	/*
 	 * fasttrap_tracepoint_remove() expects the child process to be
 	 * unlocked and the VM then expects curproc to be unlocked.
 	 */
 	_PHOLD(cp);
 	PROC_UNLOCK(cp);
 	PROC_UNLOCK(p);
+	if (p->p_dtrace_count == 0)
+		goto dup_helpers;
 #endif
 
 	/*
 	 * Iterate over every tracepoint looking for ones that belong to the
 	 * parent process, and remove each from the child process.
 	 */
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++) {
 		fasttrap_tracepoint_t *tp;
 		fasttrap_bucket_t *bucket = &fasttrap_tpoints.fth_table[i];
 
 		mutex_enter(&bucket->ftb_mtx);
 		for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 			if (tp->ftt_pid == ppid &&
 			    tp->ftt_proc->ftpc_acount != 0) {
 				int ret = fasttrap_tracepoint_remove(cp, tp);
 				ASSERT(ret == 0);
 
 				/*
 				 * The count of active providers can only be
 				 * decremented (i.e. to zero) during exec,
 				 * exit, and removal of a meta provider so it
 				 * should be impossible to drop the count
 				 * mid-fork.
 				 */
 				ASSERT(tp->ftt_proc->ftpc_acount != 0);
 #ifndef illumos
 				fprc = tp->ftt_proc;
 #endif
 			}
 		}
 		mutex_exit(&bucket->ftb_mtx);
 
 #ifndef illumos
 		/*
 		 * Unmap any scratch space inherited from the parent's address
 		 * space.
 		 *
 		 * NOTE(review): fprc is not reset between buckets, so once a
 		 * matching tracepoint has been seen this unmap is repeated for
 		 * every remaining bucket -- confirm that is intended.
 		 */
 		if (fprc != NULL) {
 			mutex_enter(&fprc->ftpc_mtx);
 			LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) {
 				vm_map_remove(&cp->p_vmspace->vm_map,
 				    scrblk->ftsb_addr,
 				    scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE);
 			}
 			mutex_exit(&fprc->ftpc_mtx);
 		}
 #endif
 	}
 
 #ifdef illumos
 	mutex_enter(&cp->p_lock);
 	sprunlock(cp);
 #else
+dup_helpers:
+	if (p->p_dtrace_helpers != NULL)
+		dtrace_helpers_duplicate(p, cp);
 	PROC_LOCK(p);
 	PROC_LOCK(cp);
 	_PRELE(cp);
 #endif
 }
 
 /*
  * This is called from proc_exit() or from exec_common() if p_dtrace_probes
  * is set on the proc structure to indicate that there is a pid provider
  * associated with this process. It retires the process' pid provider and,
  * on FreeBSD, also forgets per-thread scratch space and destroys any DTrace
  * helpers. On FreeBSD the process lock is dropped for the duration of the
  * work and held again on return.
  */
 static void
 fasttrap_exec_exit(proc_t *p)
 {
 #ifdef illumos
 	ASSERT(p == curproc);
 #else
 	struct thread *thr;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	_PHOLD(p);
 	/*
 	 * Thread structures may be recycled, so kdtrace_thread_ctor cannot be
 	 * relied upon to have cleared t_dtrace_sscr; clear it for every
 	 * thread of the process here.
 	 */
 	FOREACH_THREAD_IN_PROC(p, thr) {
 		thr->t_dtrace_sscr = NULL;
 	}
 	PROC_UNLOCK(p);
 #endif
 
 	/*
 	 * Only the pid provider is retired here; user-land static probes are
 	 * handled by the meta-provider remove entry point.
 	 */
 	fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0);
 #ifndef illumos
 	if (p->p_dtrace_helpers != NULL)
 		dtrace_helpers_destroy(p);
 	PROC_LOCK(p);
 	_PRELE(p);
 #endif
 }
 
 
 /*
  * Deliberately empty: both arguments are ignored and no probes are created.
  * NOTE(review): presumably installed as the pid provider's "provide" entry
  * point -- confirm against the provider ops table.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	/*
 	 * There are no "default" pid probes.
 	 */
 }
 
 /*
  * Enable tracepoint number index of probe in process p.
  *
  * If a live tracepoint for the same (pid, pc) is already in the hash
  * table, the probe's id is pushed onto that tracepoint's ftt_ids or
  * ftt_retids list (chosen by the id's fti_ptype). Otherwise the tracepoint
  * preallocated with the probe is initialized, linked into the bucket and
  * installed in the process.
  *
  * Returns 0 on success, FASTTRAP_ENABLE_FAIL if the ISA-specific
  * initialization fails (nothing was linked), or FASTTRAP_ENABLE_PARTIAL if
  * the tracepoint was linked but could not be installed; in the latter case
  * the caller must still call fasttrap_tracepoint_disable().
  */
 static int
 fasttrap_tracepoint_enable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
 {
 	fasttrap_tracepoint_t *tp, *new_tp = NULL;
 	fasttrap_bucket_t *bucket;
 	fasttrap_id_t *id;
 	pid_t pid;
 	uintptr_t pc;
 
 	ASSERT(index < probe->ftp_ntps);
 
 	pid = probe->ftp_pid;
 	pc = probe->ftp_tps[index].fit_tp->ftt_pc;
 	id = &probe->ftp_tps[index].fit_id;
 
 	ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
 
 #ifdef illumos
 	ASSERT(!(p->p_flag & SVFORK));
 #endif
 
 	/*
 	 * Before we make any modifications, make sure we've imposed a barrier
 	 * on the generation in which this probe was last modified.
 	 */
 	fasttrap_mod_barrier(probe->ftp_gen);
 
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 
 	/*
 	 * If the tracepoint has already been enabled, just add our id to the
 	 * list of interested probes. This may be our second time through
 	 * this path in which case we'll have constructed the tracepoint we'd
 	 * like to install. If we can't find a match, and have an allocated
 	 * tracepoint ready to go, enable that one now.
 	 *
 	 * A tracepoint whose process is defunct is also considered defunct.
 	 */
 again:
 	mutex_enter(&bucket->ftb_mtx);
 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 		/*
 		 * Note that it's safe to access the active count on the
 		 * associated proc structure because we know that at least one
 		 * provider (this one) will still be around throughout this
 		 * operation.
 		 */
 		if (tp->ftt_pid != pid || tp->ftt_pc != pc ||
 		    tp->ftt_proc->ftpc_acount == 0)
 			continue;
 
 		/*
 		 * Now that we've found a matching tracepoint, it would be
 		 * a decent idea to confirm that the tracepoint is still
 		 * enabled and the trap instruction hasn't been overwritten.
 		 * Since this is a little hairy, we'll punt for now.
 		 */
 
 		/*
 		 * This can't be the first interested probe. We don't have
 		 * to worry about another thread being in the midst of
 		 * deleting this tracepoint (which would be the only valid
 		 * reason for a tracepoint to have no interested probes)
 		 * since we're holding P_PR_LOCK for this process.
 		 */
 		ASSERT(tp->ftt_ids != NULL || tp->ftt_retids != NULL);
 
 		/*
 		 * Push the id on the front of the matching list. The id's
 		 * next pointer is written, followed by a producer barrier,
 		 * before the list head is updated.
 		 */
 		switch (id->fti_ptype) {
 		case DTFTP_ENTRY:
 		case DTFTP_OFFSETS:
 		case DTFTP_IS_ENABLED:
 			id->fti_next = tp->ftt_ids;
 			membar_producer();
 			tp->ftt_ids = id;
 			membar_producer();
 			break;
 
 		case DTFTP_RETURN:
 		case DTFTP_POST_OFFSETS:
 			id->fti_next = tp->ftt_retids;
 			membar_producer();
 			tp->ftt_retids = id;
 			membar_producer();
 			break;
 
 		default:
 			ASSERT(0);
 		}
 
 		mutex_exit(&bucket->ftb_mtx);
 
 		/*
 		 * On the second pass the preallocated tracepoint turned out
 		 * not to be needed; detach the id from it again.
 		 */
 		if (new_tp != NULL) {
 			new_tp->ftt_ids = NULL;
 			new_tp->ftt_retids = NULL;
 		}
 
 		return (0);
 	}
 
 	/*
 	 * If we have a good tracepoint ready to go, install it now while
 	 * we have the lock held and no one can screw with us.
 	 */
 	if (new_tp != NULL) {
 		int rc = 0;
 
 		new_tp->ftt_next = bucket->ftb_data;
 		membar_producer();
 		bucket->ftb_data = new_tp;
 		membar_producer();
 		mutex_exit(&bucket->ftb_mtx);
 
 		/*
 		 * Activate the tracepoint in the ISA-specific manner.
 		 * If this fails, we need to report the failure, but
 		 * indicate that this tracepoint must still be disabled
 		 * by calling fasttrap_tracepoint_disable().
 		 */
 		if (fasttrap_tracepoint_install(p, new_tp) != 0)
 			rc = FASTTRAP_ENABLE_PARTIAL;
 
 		/*
 		 * Increment the count of the number of tracepoints active in
 		 * the victim process.
 		 */
 #ifdef illumos
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
 #endif
 		p->p_dtrace_count++;
 
 		return (rc);
 	}
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * Initialize the tracepoint that's been preallocated with the probe.
 	 */
 	new_tp = probe->ftp_tps[index].fit_tp;
 
 	ASSERT(new_tp->ftt_pid == pid);
 	ASSERT(new_tp->ftt_pc == pc);
 	ASSERT(new_tp->ftt_proc == probe->ftp_prov->ftp_proc);
 	ASSERT(new_tp->ftt_ids == NULL);
 	ASSERT(new_tp->ftt_retids == NULL);
 
 	switch (id->fti_ptype) {
 	case DTFTP_ENTRY:
 	case DTFTP_OFFSETS:
 	case DTFTP_IS_ENABLED:
 		id->fti_next = NULL;
 		new_tp->ftt_ids = id;
 		break;
 
 	case DTFTP_RETURN:
 	case DTFTP_POST_OFFSETS:
 		id->fti_next = NULL;
 		new_tp->ftt_retids = id;
 		break;
 
 	default:
 		ASSERT(0);
 	}
 
 #ifdef __FreeBSD__
 	/* Record the process' data model from its SV_LP64 sysent flag. */
 	if (SV_PROC_FLAG(p, SV_LP64))
 		p->p_model = DATAMODEL_LP64;
 	else
 		p->p_model = DATAMODEL_ILP32;
 #endif
 
 	/*
 	 * If the ISA-dependent initialization goes to plan, go back to the
 	 * beginning and try to install this freshly made tracepoint.
 	 */
 	if (fasttrap_tracepoint_init(p, new_tp, pc, id->fti_ptype) == 0)
 		goto again;
 
 	/* Initialization failed: leave the preallocated tracepoint unused. */
 	new_tp->ftt_ids = NULL;
 	new_tp->ftt_retids = NULL;
 
 	return (FASTTRAP_ENABLE_FAIL);
 }
 
 /*
  * Disable tracepoint number index of probe.
  *
  * The probe's id is unlinked from the tracepoint's ftt_ids or ftt_retids
  * list. If other ids remain, the tracepoint stays in place (swapping
  * tracepoint storage with another probe if this probe's preallocated
  * tracepoint is the one in use). Otherwise the original instruction is
  * restored in p (when p is not NULL) and the tracepoint is removed from the
  * hash table. In both cases probe->ftp_gen is stamped with the current
  * modification generation.
  *
  * p may be NULL, in which case the process is not touched.
  */
 static void
 fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
 {
 	fasttrap_bucket_t *bucket;
 	fasttrap_provider_t *provider = probe->ftp_prov;
 	fasttrap_tracepoint_t **pp, *tp;
 	fasttrap_id_t *id, **idp = NULL;
 	pid_t pid;
 	uintptr_t pc;
 
 	ASSERT(index < probe->ftp_ntps);
 
 	pid = probe->ftp_pid;
 	pc = probe->ftp_tps[index].fit_tp->ftt_pc;
 	id = &probe->ftp_tps[index].fit_id;
 
 	ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
 
 	/*
 	 * Find the tracepoint and make sure that our id is one of the
 	 * ones registered with it.
 	 */
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 	mutex_enter(&bucket->ftb_mtx);
 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 		if (tp->ftt_pid == pid && tp->ftt_pc == pc &&
 		    tp->ftt_proc == provider->ftp_proc)
 			break;
 	}
 
 	/*
 	 * If we somehow lost this tracepoint, we're in a world of hurt.
 	 */
 	ASSERT(tp != NULL);
 
 	/* Pick the list (entry-side or return-side) that holds our id. */
 	switch (id->fti_ptype) {
 	case DTFTP_ENTRY:
 	case DTFTP_OFFSETS:
 	case DTFTP_IS_ENABLED:
 		ASSERT(tp->ftt_ids != NULL);
 		idp = &tp->ftt_ids;
 		break;
 
 	case DTFTP_RETURN:
 	case DTFTP_POST_OFFSETS:
 		ASSERT(tp->ftt_retids != NULL);
 		idp = &tp->ftt_retids;
 		break;
 
 	default:
 		ASSERT(0);
 	}
 
 	/* Walk to the link whose id belongs to this probe, then unlink it. */
 	while ((*idp)->fti_probe != probe) {
 		idp = &(*idp)->fti_next;
 		ASSERT(*idp != NULL);
 	}
 
 	id = *idp;
 	*idp = id->fti_next;
 	membar_producer();
 
 	ASSERT(id->fti_probe == probe);
 
 	/*
 	 * If there are other registered enablings of this tracepoint, we're
 	 * all done, but if this was the last probe associated with this
 	 * tracepoint, we need to remove and free it.
 	 */
 	if (tp->ftt_ids != NULL || tp->ftt_retids != NULL) {
 
 		/*
 		 * If the current probe's tracepoint is in use, swap it
 		 * for an unused tracepoint.
 		 */
 		if (tp == probe->ftp_tps[index].fit_tp) {
 			fasttrap_probe_t *tmp_probe;
 			fasttrap_tracepoint_t **tmp_tp;
 			uint_t tmp_index;
 
 			if (tp->ftt_ids != NULL) {
 				tmp_probe = tp->ftt_ids->fti_probe;
 				/* LINTED - alignment */
 				tmp_index = FASTTRAP_ID_INDEX(tp->ftt_ids);
 				tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp;
 			} else {
 				tmp_probe = tp->ftt_retids->fti_probe;
 				/* LINTED - alignment */
 				tmp_index = FASTTRAP_ID_INDEX(tp->ftt_retids);
 				tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp;
 			}
 
 			ASSERT(*tmp_tp != NULL);
 			ASSERT(*tmp_tp != probe->ftp_tps[index].fit_tp);
 			ASSERT((*tmp_tp)->ftt_ids == NULL);
 			ASSERT((*tmp_tp)->ftt_retids == NULL);
 
 			/* Exchange tracepoint storage between the two probes. */
 			probe->ftp_tps[index].fit_tp = *tmp_tp;
 			*tmp_tp = tp;
 		}
 
 		mutex_exit(&bucket->ftb_mtx);
 
 		/*
 		 * Tag the modified probe with the generation in which it was
 		 * changed.
 		 */
 		probe->ftp_gen = fasttrap_mod_gen;
 		return;
 	}
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * We can't safely remove the tracepoint from the set of active
 	 * tracepoints until we've actually removed the fasttrap instruction
 	 * from the process's text. We can, however, operate on this
 	 * tracepoint secure in the knowledge that no other thread is going to
 	 * be looking at it since we hold P_PR_LOCK on the process if it's
 	 * live or we hold the provider lock on the process if it's dead and
 	 * gone.
 	 */
 
 	/*
 	 * We only need to remove the actual instruction if we're looking
 	 * at an existing process
 	 */
 	if (p != NULL) {
 		/*
 		 * If we fail to restore the instruction we need to kill
 		 * this process since it's in a completely unrecoverable
 		 * state.
 		 */
 		if (fasttrap_tracepoint_remove(p, tp) != 0)
 			fasttrap_sigtrap(p, NULL, pc);
 
 		/*
 		 * Decrement the count of the number of tracepoints active
 		 * in the victim process.
 		 */
 #ifdef illumos
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
 #endif
 		p->p_dtrace_count--;
 	}
 
 	/*
 	 * Remove the probe from the hash table of active tracepoints.
 	 */
 	mutex_enter(&bucket->ftb_mtx);
 	pp = (fasttrap_tracepoint_t **)&bucket->ftb_data;
 	ASSERT(*pp != NULL);
 	while (*pp != tp) {
 		pp = &(*pp)->ftt_next;
 		ASSERT(*pp != NULL);
 	}
 
 	*pp = tp->ftt_next;
 	membar_producer();
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * Tag the modified probe with the generation in which it was changed.
 	 */
 	probe->ftp_gen = fasttrap_mod_gen;
 }
 
 /*
  * Take a reference on the pid-probe callbacks. The first reference installs
  * fasttrap_pid_probe and fasttrap_return_probe as the DTrace hooks.
  */
 static void
 fasttrap_enable_callbacks(void)
 {
 	/*
 	 * No rw-lock dance is needed: the hooks are being added rather than
 	 * removed, so no thread can have followed these pointers yet.
 	 */
 	mutex_enter(&fasttrap_count_mtx);
 	if (fasttrap_pid_count++ == 0) {
 		ASSERT(dtrace_pid_probe_ptr == NULL);
 		ASSERT(dtrace_return_probe_ptr == NULL);
 		dtrace_pid_probe_ptr = &fasttrap_pid_probe;
 		dtrace_return_probe_ptr = &fasttrap_return_probe;
 	}
 	ASSERT(dtrace_pid_probe_ptr == &fasttrap_pid_probe);
 	ASSERT(dtrace_return_probe_ptr == &fasttrap_return_probe);
 	mutex_exit(&fasttrap_count_mtx);
 }
 
 /*
  * Drop one reference on the fasttrap trap entry points.  When the last
  * reference goes away the dtrace_*_probe_ptr hooks are cleared; on illumos
  * this is done while holding every other online CPU's cpu_ft_lock as
  * writer.
  */
 static void
 fasttrap_disable_callbacks(void)
 {
 #ifdef illumos
 	cpu_t *self, *other;
 
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count > 0);
 	fasttrap_pid_count -= 1;
 
 	if (fasttrap_pid_count != 0) {
 		/* Other enabled probes still need the hooks. */
 		mutex_exit(&fasttrap_count_mtx);
 		return;
 	}
 
 #ifdef illumos
 	self = CPU;
 	other = self->cpu_next_onln;
 	while (other != self) {
 		rw_enter(&other->cpu_ft_lock, RW_WRITER);
 		other = other->cpu_next_onln;
 	}
 #endif
 	dtrace_pid_probe_ptr = NULL;
 	dtrace_return_probe_ptr = NULL;
 #ifdef illumos
 	other = self->cpu_next_onln;
 	while (other != self) {
 		rw_exit(&other->cpu_ft_lock);
 		other = other->cpu_next_onln;
 	}
 #endif
 
 	mutex_exit(&fasttrap_count_mtx);
 }
 
 /*
  * dtps_enable entry point shared by the pid and USDT providers.
  *
  * parg is the fasttrap_probe_t being enabled and id must match its ftp_id.
  * The provider's ftp_rcount is bumped unconditionally.  If the provider is
  * retired or the target process cannot be held, the function returns with
  * the probe left disabled.  Otherwise the trap callbacks are referenced and
  * every tracepoint of the probe is enabled; if any tracepoint fails, the
  * ones enabled so far are disabled again, the process hold and the callback
  * reference are dropped, and ftp_enabled stays clear.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
 	proc_t *p = NULL;
 	int i, rc;
 
 	ASSERT(probe != NULL);
 	ASSERT(!probe->ftp_enabled);
 	ASSERT(id == probe->ftp_id);
 #ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 
 	/*
 	 * Increment the count of enabled probes on this probe's provider;
 	 * the provider can't go away while the probe still exists. We
 	 * must increment this even if we aren't able to properly enable
 	 * this probe.
 	 */
 	mutex_enter(&probe->ftp_prov->ftp_mtx);
 	probe->ftp_prov->ftp_rcount++;
 	mutex_exit(&probe->ftp_prov->ftp_mtx);
 
 	/*
 	 * If this probe's provider is retired (meaning it was valid in a
 	 * previously exec'ed incarnation of this address space), bail out. The
 	 * provider can't go away while we're in this code path.
 	 */
 	if (probe->ftp_prov->ftp_retired)
 		return;
 
 	/*
 	 * If we can't find the process, it may be that we're in the context of
 	 * a fork in which the traced process is being born and we're copying
 	 * USDT probes. Otherwise, the process is gone so bail.
 	 */
 #ifdef illumos
 	if ((p = sprlock(probe->ftp_pid)) == NULL) {
 		if ((curproc->p_flag & SFORKING) == 0)
 			return;
 
 		mutex_enter(&pidlock);
 		p = prfind(probe->ftp_pid);
 
 		if (p == NULL) {
 			/*
 			 * So it's not that the target process is being born,
 			 * it's that it isn't there at all (and we simply
 			 * happen to be forking).  Anyway, we know that the
 			 * target is definitely gone, so bail out.
 			 */
 			mutex_exit(&pidlock);
 			/* This function is void; there is no value to return. */
 			return;
 		}
 
 		/*
 		 * Confirm that curproc is indeed forking the process in which
 		 * we're trying to enable probes.
 		 */
 		ASSERT(p->p_parent == curproc);
 		ASSERT(p->p_stat == SIDL);
 
 		mutex_enter(&p->p_lock);
 		mutex_exit(&pidlock);
 
 		sprlock_proc(p);
 	}
 
 	ASSERT(!(p->p_flag & SVFORK));
 	mutex_exit(&p->p_lock);
 #else
 	if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
 		return;
 #endif
 
 	/*
 	 * We have to enable the trap entry point before any user threads have
 	 * the chance to execute the trap instruction we're about to place
 	 * in their process's text.
 	 */
 	fasttrap_enable_callbacks();
 
 	/*
 	 * Enable all the tracepoints and add this probe's id to each
 	 * tracepoint's list of active probes.
 	 */
 	for (i = 0; i < probe->ftp_ntps; i++) {
 		if ((rc = fasttrap_tracepoint_enable(p, probe, i)) != 0) {
 			/*
 			 * If enabling the tracepoint failed completely,
 			 * we don't have to disable it; if the failure
 			 * was only partial we must disable it.
 			 */
 			if (rc == FASTTRAP_ENABLE_FAIL)
 				i--;
 			else
 				ASSERT(rc == FASTTRAP_ENABLE_PARTIAL);
 
 			/*
 			 * Back up and pull out all the tracepoints we've
 			 * created so far for this probe.
 			 */
 			while (i >= 0) {
 				fasttrap_tracepoint_disable(p, probe, i);
 				i--;
 			}
 
 #ifdef illumos
 			mutex_enter(&p->p_lock);
 			sprunlock(p);
 #else
 			PRELE(p);
 #endif
 
 			/*
 			 * Since we're not actually enabling this probe,
 			 * drop our reference on the trap table entry.
 			 */
 			fasttrap_disable_callbacks();
 			return;
 		}
 	}
 
 	/* Every tracepoint is in place; release the process. */
 #ifdef illumos
 	mutex_enter(&p->p_lock);
 	sprunlock(p);
 #else
 	PRELE(p);
 #endif
 
 	probe->ftp_enabled = 1;
 }
 
 /*
  * dtps_disable entry point shared by the pid and USDT providers.
  *
  * parg is the fasttrap_probe_t being disabled and id must match its ftp_id.
  * Under the provider lock this disables the probe's tracepoints (only if
  * the probe was fully enabled) and drops the provider's ftp_rcount.  It may
  * mark the provider and kick fasttrap_pid_cleanup().  If the probe had been
  * enabled it finally clears ftp_enabled and drops the trap callback
  * reference.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
 	fasttrap_provider_t *provider = probe->ftp_prov;
 	proc_t *p;
 	int i, whack = 0;
 
 	ASSERT(id == probe->ftp_id);
 
 	mutex_enter(&provider->ftp_mtx);
 
 	/*
 	 * We won't be able to acquire a /proc-esque lock on the process
 	 * iff the process is dead and gone. In this case, we rely on the
 	 * provider lock as a point of mutual exclusion to prevent other
 	 * DTrace consumers from disabling this probe.
 	 */
 	if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
 		p = NULL;
 
 	/*
 	 * Disable all the associated tracepoints (for fully enabled probes).
 	 */
 	if (probe->ftp_enabled) {
 		for (i = 0; i < probe->ftp_ntps; i++) {
 			fasttrap_tracepoint_disable(p, probe, i);
 		}
 	}
 
 	/* Undo the unconditional increment done at enable time. */
 	ASSERT(provider->ftp_rcount > 0);
 	provider->ftp_rcount--;
 
 	if (p != NULL) {
 		/*
 		 * Even though we may not be able to remove it entirely, we
 		 * mark this retired provider to get a chance to remove some
 		 * of the associated probes.
 		 */
 		if (provider->ftp_retired && !provider->ftp_marked)
 			whack = provider->ftp_marked = 1;
 		mutex_exit(&provider->ftp_mtx);
 	} else {
 		/*
 		 * If the process is dead, we're just waiting for the
 		 * last probe to be disabled to be able to free it.
 		 */
 		if (provider->ftp_rcount == 0 && !provider->ftp_marked)
 			whack = provider->ftp_marked = 1;
 		mutex_exit(&provider->ftp_mtx);
 	}
 
 	/* The cleanup is requested only after the provider lock is dropped. */
 	if (whack)
 		fasttrap_pid_cleanup();
 
 #ifdef __FreeBSD__
 	/* Release the hold taken by pget() above. */
 	if (p != NULL)
 		PRELE(p);
 #endif
 	/*
 	 * A probe that never became fully enabled holds no reference on the
 	 * trap callbacks, so there is nothing more to undo.
 	 */
 	if (!probe->ftp_enabled)
 		return;
 
 	probe->ftp_enabled = 0;
 
 #ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 	fasttrap_disable_callbacks();
 }
 
 /*
  * dtps_getargdesc entry point: fill in the native and translated type
  * names for argument desc->dtargd_ndx of the probe passed in parg.
  *
  * Both name fields are first set to the empty string.  If the provider is
  * retired or the index is out of range, dtargd_ndx is set to DTRACE_ARGNONE
  * and nothing else is filled in.  ftp_ntypes and ftp_xtypes are walked as
  * packed sequences of NUL-terminated strings; the native list is indexed
  * through ftp_argmap when one is present, the translated list by the
  * argument index itself.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_getargdesc(void *arg, dtrace_id_t id, void *parg,
     dtrace_argdesc_t *desc)
 {
 	fasttrap_probe_t *probe = parg;
 	char *str;
 	int i, ndx;
 
 	desc->dtargd_native[0] = '\0';
 	desc->dtargd_xlate[0] = '\0';
 
 	if (probe->ftp_prov->ftp_retired != 0 ||
 	    desc->dtargd_ndx >= probe->ftp_nargs) {
 		desc->dtargd_ndx = DTRACE_ARGNONE;
 		return;
 	}
 
 	ndx = (probe->ftp_argmap != NULL) ?
 	    probe->ftp_argmap[desc->dtargd_ndx] : desc->dtargd_ndx;
 
 	/* Skip over the first ndx strings of the native type list. */
 	str = probe->ftp_ntypes;
 	for (i = 0; i < ndx; i++) {
 		str += strlen(str) + 1;
 	}
 
 	/*
 	 * strcpy() writes strlen(str) + 1 bytes, so the string itself must
 	 * be strictly shorter than the destination.  (Measuring from
 	 * str + 1 would undercount by one and, for an empty string, read
 	 * past its terminator.)
 	 */
 	ASSERT(strlen(str) < sizeof (desc->dtargd_native));
 	(void) strcpy(desc->dtargd_native, str);
 
 	if (probe->ftp_xtypes == NULL)
 		return;
 
 	/* The translated type list is indexed by the unmapped argument. */
 	str = probe->ftp_xtypes;
 	for (i = 0; i < desc->dtargd_ndx; i++) {
 		str += strlen(str) + 1;
 	}
 
 	ASSERT(strlen(str) < sizeof (desc->dtargd_xlate));
 	(void) strcpy(desc->dtargd_xlate, str);
 }
 
 /*
  * dtps_destroy entry point: release a disabled probe (parg) together with
  * every tracepoint structure it owns, and subtract its tracepoints from the
  * global fasttrap_total count.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
 	int tpidx = 0;
 
 	ASSERT(probe != NULL);
 	ASSERT(!probe->ftp_enabled);
 	ASSERT(fasttrap_total >= probe->ftp_ntps);
 
 	atomic_add_32(&fasttrap_total, -probe->ftp_ntps);
 
 	/*
 	 * A probe modified in the current or the immediately preceding
 	 * generation goes through the modification barrier before its
 	 * memory is released.
 	 */
 	if (probe->ftp_gen + 1 >= fasttrap_mod_gen)
 		fasttrap_mod_barrier(probe->ftp_gen);
 
 	while (tpidx < probe->ftp_ntps) {
 		kmem_free(probe->ftp_tps[tpidx].fit_tp,
 		    sizeof (fasttrap_tracepoint_t));
 		tpidx++;
 	}
 
 	/* The probe was sized for exactly ftp_ntps trailing entries. */
 	kmem_free(probe,
 	    offsetof(fasttrap_probe_t, ftp_tps[probe->ftp_ntps]));
 }
 
 
 /*
  * Stability attributes registered for the pid provider; passed to
  * fasttrap_provider_lookup() by fasttrap_add_probe() and also used there to
  * tell the pid provider apart from USDT providers.
  * NOTE(review): the five rows presumably map, in order, to the
  * dtpa_provider, dtpa_mod, dtpa_func, dtpa_name and dtpa_args members of
  * dtrace_pattr_t -- confirm against the structure definition.
  */
 static const dtrace_pattr_t pid_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 };
 
 /*
  * Provider operations handed to dtrace_register() when the provider is
  * being created with pid_attr.  Identical to usdt_pops except for the
  * argument-value accessor.
  */
 static dtrace_pops_t pid_pops = {
 	/* Probe creation and teardown. */
 	.dtps_provide =		fasttrap_pid_provide,
 	.dtps_destroy =		fasttrap_pid_destroy,
 
 	/* Enabling control. */
 	.dtps_enable =		fasttrap_pid_enable,
 	.dtps_disable =		fasttrap_pid_disable,
 
 	/* Argument access. */
 	.dtps_getargdesc =	fasttrap_pid_getargdesc,
 	.dtps_getargval =	fasttrap_pid_getarg,
 
 	/* Entry points this provider leaves unset. */
 	.dtps_provide_module =	NULL,
 	.dtps_suspend =		NULL,
 	.dtps_resume =		NULL,
 	.dtps_usermode =	NULL
 };
 
 /*
  * Provider operations handed to dtrace_register() for every provider that
  * is not created with pid_attr.  Identical to pid_pops except for the
  * argument-value accessor.
  */
 static dtrace_pops_t usdt_pops = {
 	/* Probe creation and teardown. */
 	.dtps_provide =		fasttrap_pid_provide,
 	.dtps_destroy =		fasttrap_pid_destroy,
 
 	/* Enabling control. */
 	.dtps_enable =		fasttrap_pid_enable,
 	.dtps_disable =		fasttrap_pid_disable,
 
 	/* Argument access. */
 	.dtps_getargdesc =	fasttrap_pid_getargdesc,
 	.dtps_getargval =	fasttrap_usdt_getarg,
 
 	/* Entry points this provider leaves unset. */
 	.dtps_provide_module =	NULL,
 	.dtps_suspend =		NULL,
 	.dtps_resume =		NULL,
 	.dtps_usermode =	NULL
 };
 
 /*
  * Find the fasttrap_proc_t for pid in the fasttrap_procs hash, or create
  * one.
  *
  * An existing entry only matches while its ftpc_acount is non-zero; a match
  * is returned with both ftpc_rcount and ftpc_acount incremented.  A newly
  * created entry starts with both counts at 1 and is linked at the head of
  * its bucket.  The allocation sleeps, so it is done without the bucket lock
  * and the chain is searched a second time afterwards.
  */
 static fasttrap_proc_t *
 fasttrap_proc_lookup(pid_t pid)
 {
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, *new_fprc;
 
 
 	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
 		if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
 			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
 			return (fprc);
 		}
 	}
 
 	/*
 	 * Drop the bucket lock so we don't try to perform a sleeping
 	 * allocation under it.
 	 */
 	mutex_exit(&bucket->ftb_mtx);
 
 	new_fprc = kmem_zalloc(sizeof (fasttrap_proc_t), KM_SLEEP);
 	new_fprc->ftpc_pid = pid;
 	new_fprc->ftpc_rcount = 1;
 	new_fprc->ftpc_acount = 1;
 #ifndef illumos
 	mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT,
 	    NULL);
 #endif
 
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Take another lap through the list to make sure a proc hasn't
 	 * been created for this pid while we weren't under the bucket lock.
 	 */
 	for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
 		if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
 			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
 			/*
 			 * We lost the race.  The spare entry was never
 			 * published, so nobody can hold its lock; tear the
 			 * lock down before the memory goes back, mirroring
 			 * the mutex_init() above.
 			 */
 #ifndef illumos
 			mutex_destroy(&new_fprc->ftpc_mtx);
 #endif
 			kmem_free(new_fprc, sizeof (fasttrap_proc_t));
 
 			return (fprc);
 		}
 	}
 
 	new_fprc->ftpc_next = bucket->ftb_data;
 	bucket->ftb_data = new_fprc;
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	return (new_fprc);
 }
 
 /*
  * Drop one reference (ftpc_rcount) on a fasttrap_proc_t.  When the last
  * reference goes away the structure is unlinked from the fasttrap_procs
  * hash and freed; on non-illumos builds the per-process scratch-space
  * bookkeeping is freed first and each thread's t_dtrace_sscr pointer is
  * cleared if the process can still be found.
  */
 static void
 fasttrap_proc_release(fasttrap_proc_t *proc)
 {
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, **fprcp;
 	pid_t pid = proc->ftpc_pid;
 #ifndef illumos
 	fasttrap_scrblock_t *scrblk, *scrblktmp;
 	fasttrap_scrspace_t *scrspc, *scrspctmp;
 	struct proc *p;
 	struct thread *td;
 #endif
 
 	mutex_enter(&proc->ftpc_mtx);
 
 	ASSERT(proc->ftpc_rcount != 0);
 	ASSERT(proc->ftpc_acount <= proc->ftpc_rcount);
 
 	if (--proc->ftpc_rcount != 0) {
 		mutex_exit(&proc->ftpc_mtx);
 		return;
 	}
 
 #ifndef illumos
 	/*
 	 * Free all structures used to manage per-thread scratch space.
 	 */
 	LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
 	    scrblktmp) {
 		LIST_REMOVE(scrblk, ftsb_next);
 		free(scrblk, M_SOLARIS);
 	}
 	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
 		LIST_REMOVE(scrspc, ftss_next);
 		free(scrspc, M_SOLARIS);
 	}
 	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
 		LIST_REMOVE(scrspc, ftss_next);
 		free(scrspc, M_SOLARIS);
 	}
 
 	/* The scratch space is gone; don't leave threads pointing at it. */
 	if ((p = pfind(pid)) != NULL) {
 		FOREACH_THREAD_IN_PROC(p, td)
 			td->t_dtrace_sscr = NULL;
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	mutex_exit(&proc->ftpc_mtx);
 
 	/*
 	 * There should definitely be no live providers associated with this
 	 * process at this point.
 	 */
 	ASSERT(proc->ftpc_acount == 0);
 
 	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	fprcp = (fasttrap_proc_t **)&bucket->ftb_data;
 	while ((fprc = *fprcp) != NULL) {
 		if (fprc == proc)
 			break;
 
 		fprcp = &fprc->ftpc_next;
 	}
 
 	/*
 	 * Something strange has happened if we can't find the proc.
 	 */
 	ASSERT(fprc != NULL);
 
 	*fprcp = fprc->ftpc_next;
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * The entry is now unreachable from the hash and its active count
 	 * is zero, so fasttrap_proc_lookup() can no longer hand it out.
 	 * Destroy the lock set up with mutex_init() at creation time before
 	 * releasing the memory.
 	 */
 #ifndef illumos
 	mutex_destroy(&fprc->ftpc_mtx);
 #endif
 	kmem_free(fprc, sizeof (fasttrap_proc_t));
 }
 
 /*
  * Lookup a fasttrap-managed provider based on its name and associated pid.
  * The pattr argument must be non-NULL (this is asserted).  If no live
  * (non-retired) provider matches, a new one is instantiated and registered
  * with the DTrace framework using pattr.  NULL is returned when the process
  * cannot be found, when the full provider name does not fit, or when
  * dtrace_register() fails.  On success the provider is returned with its
  * lock (ftp_mtx) held.
  */
 static fasttrap_provider_t *
 fasttrap_provider_lookup(pid_t pid, const char *name,
     const dtrace_pattr_t *pattr)
 {
 	fasttrap_provider_t *fp, *new_fp = NULL;
 	fasttrap_bucket_t *bucket;
 	char provname[DTRACE_PROVNAMELEN];
 	proc_t *p;
 	cred_t *cred;
 
 	ASSERT(strlen(name) < sizeof (fp->ftp_name));
 	ASSERT(pattr != NULL);
 
 	bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Take a lap through the list and return the match if we find it.
 	 */
 	for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
 		if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
 		    !fp->ftp_retired) {
 			mutex_enter(&fp->ftp_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			return (fp);
 		}
 	}
 
 	/*
 	 * Drop the bucket lock so we don't try to perform a sleeping
 	 * allocation under it.
 	 */
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * Make sure the process exists, isn't a child created as the result
 	 * of a vfork(2), and isn't a zombie (but may be in fork).
 	 */
 	if ((p = pfind(pid)) == NULL)
 		return (NULL);
 
 	/*
 	 * Increment p_dtrace_probes so that the process knows to inform us
 	 * when it exits or execs. fasttrap_provider_free() decrements this
 	 * when we're done with this provider.
 	 */
 	p->p_dtrace_probes++;
 
 	/*
 	 * Grab the credentials for this process so we have
 	 * something to pass to dtrace_register().
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	crhold(p->p_ucred);
 	cred = p->p_ucred;
 	PROC_UNLOCK(p);
 
 	new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
 	new_fp->ftp_pid = pid;
 	new_fp->ftp_proc = fasttrap_proc_lookup(pid);
 #ifndef illumos
 	mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
 	mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
 #endif
 
 	ASSERT(new_fp->ftp_proc != NULL);
 
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Take another lap through the list to make sure a provider hasn't
 	 * been created for this pid while we weren't under the bucket lock.
 	 */
 	for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
 		if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
 		    !fp->ftp_retired) {
 			mutex_enter(&fp->ftp_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			/* Lost the race: discard our copy and the cred hold. */
 			fasttrap_provider_free(new_fp);
 			crfree(cred);
 			return (fp);
 		}
 	}
 
 	/* The length of name was asserted against ftp_name on entry. */
 	(void) strcpy(new_fp->ftp_name, name);
 
 	/*
 	 * Fail and return NULL if either the provider name is too long
 	 * or we fail to register this new provider with the DTrace
 	 * framework. Note that this is the only place we ever construct
 	 * the full provider name -- we keep it in pieces in the provider
 	 * structure.
 	 */
 	if (snprintf(provname, sizeof (provname), "%s%u", name, (uint_t)pid) >=
 	    sizeof (provname) ||
 	    dtrace_register(provname, pattr,
 	    DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER, cred,
 	    pattr == &pid_attr ? &pid_pops : &usdt_pops, new_fp,
 	    &new_fp->ftp_provid) != 0) {
 		mutex_exit(&bucket->ftb_mtx);
 		fasttrap_provider_free(new_fp);
 		crfree(cred);
 		return (NULL);
 	}
 
 	/* Publish the new provider at the head of the bucket's chain. */
 	new_fp->ftp_next = bucket->ftb_data;
 	bucket->ftb_data = new_fp;
 
 	mutex_enter(&new_fp->ftp_mtx);
 	mutex_exit(&bucket->ftb_mtx);
 
 	crfree(cred);
 	return (new_fp);
 }
 
 /*
  * Destroy a provider that has no enabled probes, no consumers creating
  * probes and no meta-provider references: release its fasttrap_proc_t,
  * free the structure, and decrement p_dtrace_probes on the owning process
  * if that process can still be found.
  */
 static void
 fasttrap_provider_free(fasttrap_provider_t *provider)
 {
 	proc_t *owner;
 	pid_t pid;
 
 	ASSERT(provider->ftp_rcount == 0);
 	ASSERT(provider->ftp_ccount == 0);
 	ASSERT(provider->ftp_mcount == 0);
 
 	/* Remember the pid; the provider is freed before it is needed. */
 	pid = provider->ftp_pid;
 
 	/*
 	 * A provider that was never retired still counts as active on its
 	 * process structure, so give that count back explicitly.
 	 */
 	if (provider->ftp_retired == 0) {
 		atomic_dec_64(&provider->ftp_proc->ftpc_acount);
 		ASSERT(provider->ftp_proc->ftpc_acount <
 		    provider->ftp_proc->ftpc_rcount);
 	}
 
 	fasttrap_proc_release(provider->ftp_proc);
 
 #ifndef illumos
 	mutex_destroy(&provider->ftp_mtx);
 	mutex_destroy(&provider->ftp_cmtx);
 #endif
 	kmem_free(provider, sizeof (fasttrap_provider_t));
 
 	/*
 	 * Undo the p_dtrace_probes increment made when the provider was
 	 * created. We don't have to worry about clobbering someone else's
 	 * modifications to it because we have locked the bucket that
 	 * corresponds to this process's hash chain in the provider hash
 	 * table. A process that has already gone away needs no adjustment.
 	 */
 	owner = pfind(pid);
 	if (owner != NULL) {
 		owner->p_dtrace_probes--;
 #ifndef illumos
 		PROC_UNLOCK(owner);
 #endif
 	}
 }
 
 /*
  * Retire the live (non-retired) provider matching pid and name, if there is
  * one.  When mprov is non-zero the call first drops one meta-provider
  * reference (ftp_mcount) and only goes on to retire the provider if that
  * count reaches zero.  Retiring marks the provider, drops the active count
  * on its proc, invalidates it in the DTrace framework and requests a
  * cleanup pass.
  */
 static void
 fasttrap_provider_retire(pid_t pid, const char *name, int mprov)
 {
 	fasttrap_provider_t *fp;
 	fasttrap_bucket_t *bucket;
 	dtrace_provider_id_t provid;
 
 	ASSERT(strlen(name) < sizeof (fp->ftp_name));
 
 	bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
 		if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
 		    !fp->ftp_retired)
 			break;
 	}
 
 	/* Nothing to do if there is no live provider by that name. */
 	if (fp == NULL) {
 		mutex_exit(&bucket->ftb_mtx);
 		return;
 	}
 
 	mutex_enter(&fp->ftp_mtx);
 	ASSERT(!mprov || fp->ftp_mcount > 0);
 	if (mprov && --fp->ftp_mcount != 0)  {
 		mutex_exit(&fp->ftp_mtx);
 		mutex_exit(&bucket->ftb_mtx);
 		return;
 	}
 
 	/*
 	 * Mark the provider to be removed in our post-processing step, mark it
 	 * retired, and drop the active count on its proc. Marking it indicates
 	 * that we should try to remove it; setting the retired flag indicates
 	 * that we're done with this provider; dropping the active count on the
 	 * proc releases our hold, and when this reaches zero (as it will
 	 * during exit or exec) the proc and associated providers become
 	 * defunct.
 	 *
 	 * We obviously need to take the bucket lock before the provider lock
 	 * to perform the lookup, but we need to drop the provider lock
 	 * before calling into the DTrace framework since we acquire the
 	 * provider lock in callbacks invoked from the DTrace framework. The
 	 * bucket lock therefore protects the integrity of the provider hash
 	 * table.
 	 */
 	atomic_dec_64(&fp->ftp_proc->ftpc_acount);
 	ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
 
 	fp->ftp_retired = 1;
 	fp->ftp_marked = 1;
 	provid = fp->ftp_provid;
 	mutex_exit(&fp->ftp_mtx);
 
 	/*
 	 * We don't have to worry about invalidating the same provider twice
 	 * since fasttrap_provider_lookup() will ignore providers that have
 	 * been marked as retired.
 	 */
 	dtrace_invalidate(provid);
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	fasttrap_pid_cleanup();
 }
 
 /*
  * qsort(3) comparator for arrays of uint32_t, ascending.
  *
  * Returns -1, 0 or 1.  The values are compared rather than subtracted:
  * an unsigned difference converted to int has the wrong sign whenever the
  * operands are more than INT_MAX apart.
  */
 static int
 fasttrap_uint32_cmp(const void *ap, const void *bp)
 {
 	const uint32_t a = *(const uint32_t *)ap;
 	const uint32_t b = *(const uint32_t *)bp;
 
 	return ((a > b) - (a < b));
 }
 
 /*
  * qsort(3) comparator for arrays of uint64_t, ascending.
  *
  * Returns -1, 0 or 1.  The values are compared rather than subtracted:
  * truncating a 64-bit difference to int loses the high bits, so values
  * that differ by a multiple of 2^32 would compare equal and others could
  * get the wrong sign.
  */
 static int
 fasttrap_uint64_cmp(const void *ap, const void *bp)
 {
 	const uint64_t a = *(const uint64_t *)ap;
 	const uint64_t b = *(const uint64_t *)bp;
 
 	return ((a > b) - (a < b));
 }
 
 /*
  * Create the pid-provider probe(s) described by pdata.
  *
  * For DTFTP_OFFSETS one single-tracepoint probe is created per offset,
  * named after the offset in hex; offsets whose probe already exists are
  * skipped.  For DTFTP_ENTRY and DTFTP_RETURN a single "entry" or "return"
  * probe carrying all offsets is created unless it already exists.
  *
  * Returns 0 on success, EINVAL if there are no offsets or the type is
  * unknown, ESRCH if the provider cannot be looked up or created, and
  * ENOMEM if the fasttrap_max budget would be exceeded or (entry/return
  * case) the sorted offsets are not strictly increasing.  On the ENOMEM path
  * the provider is marked and a cleanup pass is requested.
  */
 static int
 fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
 {
 	fasttrap_provider_t *provider;
 	fasttrap_probe_t *pp;
 	fasttrap_tracepoint_t *tp;
 	char *name;
 	int i, aframes = 0, whack;
 
 	/*
 	 * There needs to be at least one desired trace point.
 	 */
 	if (pdata->ftps_noffs == 0)
 		return (EINVAL);
 
 	switch (pdata->ftps_type) {
 	case DTFTP_ENTRY:
 		name = "entry";
 		aframes = FASTTRAP_ENTRY_AFRAMES;
 		break;
 	case DTFTP_RETURN:
 		name = "return";
 		aframes = FASTTRAP_RETURN_AFRAMES;
 		break;
 	case DTFTP_OFFSETS:
 		/* Offset probes are named individually below. */
 		name = NULL;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/* On success the provider comes back with ftp_mtx held. */
 	if ((provider = fasttrap_provider_lookup(pdata->ftps_pid,
 	    FASTTRAP_PID_NAME, &pid_attr)) == NULL)
 		return (ESRCH);
 
 	/*
 	 * Increment this reference count to indicate that a consumer is
 	 * actively adding a new probe associated with this provider. This
 	 * prevents the provider from being deleted -- we'll need to check
 	 * for pending deletions when we drop this reference count.
 	 */
 	provider->ftp_ccount++;
 	mutex_exit(&provider->ftp_mtx);
 
 	/*
 	 * Grab the creation lock to ensure consistency between calls to
 	 * dtrace_probe_lookup() and dtrace_probe_create() in the face of
 	 * other threads creating probes. We must drop the provider lock
 	 * before taking this lock to avoid a three-way deadlock with the
 	 * DTrace framework.
 	 */
 	mutex_enter(&provider->ftp_cmtx);
 
 	if (name == NULL) {
 		for (i = 0; i < pdata->ftps_noffs; i++) {
 			/* 16 hex digits of a 64-bit offset plus the NUL. */
 			char name_str[17];
 
 			(void) sprintf(name_str, "%llx",
 			    (unsigned long long)pdata->ftps_offs[i]);
 
 			/* Skip offsets that already have a probe. */
 			if (dtrace_probe_lookup(provider->ftp_provid,
 			    pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
 				continue;
 
 			/* Reserve one tracepoint; back out if over budget. */
 			atomic_inc_32(&fasttrap_total);
 
 			if (fasttrap_total > fasttrap_max) {
 				atomic_dec_32(&fasttrap_total);
 				goto no_mem;
 			}
 
 			pp = kmem_zalloc(sizeof (fasttrap_probe_t), KM_SLEEP);
 
 			pp->ftp_prov = provider;
 			pp->ftp_faddr = pdata->ftps_pc;
 			pp->ftp_fsize = pdata->ftps_size;
 			pp->ftp_pid = pdata->ftps_pid;
 			pp->ftp_ntps = 1;
 
 			tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t),
 			    KM_SLEEP);
 
 			tp->ftt_proc = provider->ftp_proc;
 			tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
 			tp->ftt_pid = pdata->ftps_pid;
 
 			pp->ftp_tps[0].fit_tp = tp;
 			pp->ftp_tps[0].fit_id.fti_probe = pp;
 			pp->ftp_tps[0].fit_id.fti_ptype = pdata->ftps_type;
 
 			pp->ftp_id = dtrace_probe_create(provider->ftp_provid,
 			    pdata->ftps_mod, pdata->ftps_func, name_str,
 			    FASTTRAP_OFFSET_AFRAMES, pp);
 		}
 
 	} else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod,
 	    pdata->ftps_func, name) == 0) {
 		/* Reserve all tracepoints at once; back out if over budget. */
 		atomic_add_32(&fasttrap_total, pdata->ftps_noffs);
 
 		if (fasttrap_total > fasttrap_max) {
 			atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
 			goto no_mem;
 		}
 
 		/*
 		 * Make sure all tracepoint program counter values are unique.
 		 * We later assume that each probe has exactly one tracepoint
 		 * for a given pc.
 		 */
 		qsort(pdata->ftps_offs, pdata->ftps_noffs,
 		    sizeof (uint64_t), fasttrap_uint64_cmp);
 		for (i = 1; i < pdata->ftps_noffs; i++) {
 			if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1])
 				continue;
 
 			/* Duplicate offset: undo the reservation and fail. */
 			atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
 			goto no_mem;
 		}
 
 		ASSERT(pdata->ftps_noffs > 0);
 		/* Size the probe for ftps_noffs trailing tracepoint slots. */
 		pp = kmem_zalloc(offsetof(fasttrap_probe_t,
 		    ftp_tps[pdata->ftps_noffs]), KM_SLEEP);
 
 		pp->ftp_prov = provider;
 		pp->ftp_faddr = pdata->ftps_pc;
 		pp->ftp_fsize = pdata->ftps_size;
 		pp->ftp_pid = pdata->ftps_pid;
 		pp->ftp_ntps = pdata->ftps_noffs;
 
 		for (i = 0; i < pdata->ftps_noffs; i++) {
 			tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t),
 			    KM_SLEEP);
 
 			tp->ftt_proc = provider->ftp_proc;
 			tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
 			tp->ftt_pid = pdata->ftps_pid;
 
 			pp->ftp_tps[i].fit_tp = tp;
 			pp->ftp_tps[i].fit_id.fti_probe = pp;
 			pp->ftp_tps[i].fit_id.fti_ptype = pdata->ftps_type;
 		}
 
 		pp->ftp_id = dtrace_probe_create(provider->ftp_provid,
 		    pdata->ftps_mod, pdata->ftps_func, name, aframes, pp);
 	}
 
 	mutex_exit(&provider->ftp_cmtx);
 
 	/*
 	 * We know that the provider is still valid since we incremented the
 	 * creation reference count. If someone tried to clean up this provider
 	 * while we were using it (e.g. because the process called exec(2) or
 	 * exit(2)), take note of that and try to clean it up now.
 	 */
 	mutex_enter(&provider->ftp_mtx);
 	provider->ftp_ccount--;
 	whack = provider->ftp_retired;
 	mutex_exit(&provider->ftp_mtx);
 
 	if (whack)
 		fasttrap_pid_cleanup();
 
 	return (0);
 
 no_mem:
 	/*
 	 * If we've exhausted the allowable resources, we'll try to remove
 	 * this provider to free some up. This is to cover the case where
 	 * the user has accidentally created many more probes than was
 	 * intended (e.g. pid123:::).
 	 */
 	mutex_exit(&provider->ftp_cmtx);
 	mutex_enter(&provider->ftp_mtx);
 	provider->ftp_ccount--;
 	provider->ftp_marked = 1;
 	mutex_exit(&provider->ftp_mtx);
 
 	fasttrap_pid_cleanup();
 
 	return (ENOMEM);
 }
 
 /*
  * dtms_provide_pid entry point: look up or create the USDT provider
  * described by dhpv for process pid and take a meta-provider reference on
  * it.  Returns the provider, or NULL (after logging a message) if the name
  * is too long, the name is that of the pid provider, or the lookup fails.
  * The stability classes in dhpv are capped at DTRACE_CLASS_ISA in place.
  */
 /*ARGSUSED*/
 static void *
 fasttrap_meta_provide(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
 {
 	fasttrap_provider_t *provider;
 	const char *provname = dhpv->dthpv_provname;
 	dtrace_pattr_t *attrs = &dhpv->dthpv_pattr;
 
 	/*
 	 * The pid is appended to the name in decimal, and a 32-bit unsigned
 	 * value needs at most 10 digits; reject names that would not leave
 	 * room for them.
 	 */
 	if (strlen(provname) + 10 >= sizeof (provider->ftp_name)) {
 		printf("failed to instantiate provider %s: "
 		    "name too long to accomodate pid", provname);
 		return (NULL);
 	}
 
 	/*
 	 * The name of the real pid provider is reserved.
 	 */
 	if (strcmp(provname, FASTTRAP_PID_NAME) == 0) {
 		printf("failed to instantiate provider %s: "
 		    "%s is an invalid name", provname,
 		    FASTTRAP_PID_NAME);
 		return (NULL);
 	}
 
 	/*
 	 * fasttrap supports no stability class above ISA, so clamp each of
 	 * the five attribute classes to it.
 	 */
 	if (attrs->dtpa_provider.dtat_class > DTRACE_CLASS_ISA)
 		attrs->dtpa_provider.dtat_class = DTRACE_CLASS_ISA;
 	if (attrs->dtpa_mod.dtat_class > DTRACE_CLASS_ISA)
 		attrs->dtpa_mod.dtat_class = DTRACE_CLASS_ISA;
 	if (attrs->dtpa_func.dtat_class > DTRACE_CLASS_ISA)
 		attrs->dtpa_func.dtat_class = DTRACE_CLASS_ISA;
 	if (attrs->dtpa_name.dtat_class > DTRACE_CLASS_ISA)
 		attrs->dtpa_name.dtat_class = DTRACE_CLASS_ISA;
 	if (attrs->dtpa_args.dtat_class > DTRACE_CLASS_ISA)
 		attrs->dtpa_args.dtat_class = DTRACE_CLASS_ISA;
 
 	provider = fasttrap_provider_lookup(pid, provname, attrs);
 	if (provider == NULL) {
 		printf("failed to instantiate provider %s for "
 		    "process %u",  provname, (uint_t)pid);
 		return (NULL);
 	}
 
 	/*
 	 * The lookup returned with the provider lock held.  Take a meta
 	 * provider reference under it so the provider stays around until
 	 * the meta provider is told to remove it, then drop the lock.
 	 */
 	provider->ftp_mcount++;
 	mutex_exit(&provider->ftp_mtx);
 
 	return (provider);
 }
 
 /*
  * We know a few things about our context here:  we know that the probe being
  * created doesn't already exist (DTrace won't load DOF at the same address
  * twice, even if explicitly told to do so) and we know that we are
  * single-threaded with respect to the meta provider machinery. Knowing that
  * this is a new probe and that there is no way for us to race with another
  * operation on this provider allows us an important optimization: we need not
  * lookup a probe before adding it.  Saving this lookup is important because
  * this code is in the fork path for processes with USDT probes, and lookups
  * here are potentially very expensive because of long hash conflicts on
  * module, function and name (DTrace doesn't hash on provider name).
  *
  * The function returns without creating anything, and without reporting an
  * error, when the regular or is-enabled offsets contain duplicates or when
  * the new tracepoints would push fasttrap_total past fasttrap_max.
  */
 /*ARGSUSED*/
 static void
 fasttrap_meta_create_probe(void *arg, void *parg,
     dtrace_helper_probedesc_t *dhpb)
 {
 	fasttrap_provider_t *provider = parg;
 	fasttrap_probe_t *pp;
 	fasttrap_tracepoint_t *tp;
 	int i, j;
 	uint32_t ntps;
 
 	/*
 	 * Since the meta provider count is non-zero we don't have to worry
 	 * about this provider disappearing.
 	 */
 	ASSERT(provider->ftp_mcount > 0);
 
 	/*
 	 * The offsets must be unique.
 	 */
 	qsort(dhpb->dthpb_offs, dhpb->dthpb_noffs, sizeof (uint32_t),
 	    fasttrap_uint32_cmp);
 	for (i = 1; i < dhpb->dthpb_noffs; i++) {
 		if (dhpb->dthpb_base + dhpb->dthpb_offs[i] <=
 		    dhpb->dthpb_base + dhpb->dthpb_offs[i - 1])
 			return;
 	}
 
 	/* The same uniqueness requirement applies to is-enabled offsets. */
 	qsort(dhpb->dthpb_enoffs, dhpb->dthpb_nenoffs, sizeof (uint32_t),
 	    fasttrap_uint32_cmp);
 	for (i = 1; i < dhpb->dthpb_nenoffs; i++) {
 		if (dhpb->dthpb_base + dhpb->dthpb_enoffs[i] <=
 		    dhpb->dthpb_base + dhpb->dthpb_enoffs[i - 1])
 			return;
 	}
 
 	ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs;
 	ASSERT(ntps > 0);
 
 	/* Reserve the tracepoints; back out if that exceeds the limit. */
 	atomic_add_32(&fasttrap_total, ntps);
 
 	if (fasttrap_total > fasttrap_max) {
 		atomic_add_32(&fasttrap_total, -ntps);
 		return;
 	}
 
 	/* Size the probe for ntps trailing tracepoint slots. */
 	pp = kmem_zalloc(offsetof(fasttrap_probe_t, ftp_tps[ntps]), KM_SLEEP);
 
 	pp->ftp_prov = provider;
 	pp->ftp_pid = provider->ftp_pid;
 	pp->ftp_ntps = ntps;
 	pp->ftp_nargs = dhpb->dthpb_xargc;
 	pp->ftp_xtypes = dhpb->dthpb_xtypes;
 	pp->ftp_ntypes = dhpb->dthpb_ntypes;
 
 	/*
 	 * First create a tracepoint for each actual point of interest.
 	 */
 	for (i = 0; i < dhpb->dthpb_noffs; i++) {
 		tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
 
 		tp->ftt_proc = provider->ftp_proc;
 		tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_offs[i];
 		tp->ftt_pid = provider->ftp_pid;
 
 		pp->ftp_tps[i].fit_tp = tp;
 		pp->ftp_tps[i].fit_id.fti_probe = pp;
 #ifdef __sparc
 		pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_POST_OFFSETS;
 #else
 		pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_OFFSETS;
 #endif
 	}
 
 	/*
 	 * Then create a tracepoint for each is-enabled point.  i carries on
 	 * from the previous loop as the slot index while j indexes the
 	 * is-enabled offsets.
 	 */
 	for (j = 0; i < ntps; i++, j++) {
 		tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
 
 		tp->ftt_proc = provider->ftp_proc;
 		tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_enoffs[j];
 		tp->ftt_pid = provider->ftp_pid;
 
 		pp->ftp_tps[i].fit_tp = tp;
 		pp->ftp_tps[i].fit_id.fti_probe = pp;
 		pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_IS_ENABLED;
 	}
 
 	/*
 	 * If the arguments are shuffled around we set the argument remapping
 	 * table. Later, when the probe fires, we only remap the arguments
 	 * if the table is non-NULL.
 	 */
 	for (i = 0; i < dhpb->dthpb_xargc; i++) {
 		if (dhpb->dthpb_args[i] != i) {
 			pp->ftp_argmap = dhpb->dthpb_args;
 			break;
 		}
 	}
 
 	/*
 	 * The probe is fully constructed -- register it with DTrace.
 	 */
 	pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod,
 	    dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp);
 }
 
 /*
  * dtms_remove_pid entry point: drop the meta provider's reference on the
  * USDT provider named in dhpv for process pid.
  */
 /*ARGSUSED*/
 static void
 fasttrap_meta_remove(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
 {
 	const char *provname = dhpv->dthpv_provname;
 
 	/*
 	 * This only puts the provider on death row.  Consumers may still be
 	 * busy adding probes to it, and nothing is torn down until that
 	 * count has dropped to zero.  The final argument tells the retire
 	 * routine that a meta-provider reference is being released.
 	 */
 	fasttrap_provider_retire(pid, provname, 1);
 }
 
 /*
  * Meta-provider operations for USDT providers, listed in the order they
  * are used over a provider's lifetime: instantiate the provider for a
  * process, create its probes, and finally remove it.
  */
 static dtrace_mops_t fasttrap_mops = {
 	.dtms_provide_pid =	fasttrap_meta_provide,
 	.dtms_create_probe =	fasttrap_meta_create_probe,
 	.dtms_remove_pid =	fasttrap_meta_remove
 };
 
 /*
  * Character-device open handler.  No per-open state is set up and none of
  * the arguments are examined; the open always succeeds.
  */
 /*ARGSUSED*/
 static int
 fasttrap_open(struct cdev *dev __unused, int oflags __unused,
     int devtype __unused, struct thread *td __unused)
 {
 	const int error = 0;
 
 	return (error);
 }
 
 /*
  * ioctl handler for the fasttrap control device.
  *
  * FASTTRAPIOC_MAKEPROBE: arg holds a user pointer to a
  * fasttrap_probe_spec_t.  The spec is copied in, its function and module
  * names are validated, and it is handed to fasttrap_add_probe().
  *
  * FASTTRAPIOC_GETINSTR: looks up the tracepoint matching the pid/pc of a
  * fasttrap_instr_query_t and copies the saved instruction back out.
  *
  * Returns 0 on success, otherwise an errno value: EAGAIN when DTrace is
  * not attached, EFAULT when a copy fails, EINVAL for an unknown command or
  * a malformed spec, ENOMEM for a spec larger than 1MB and ENOENT when no
  * tracepoint matches.  For MAKEPROBE the result of fasttrap_add_probe()
  * is returned.
  */
 /*ARGSUSED*/
 static int
 fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag,
     struct thread *td)
 {
 	if (!dtrace_attached())
 		return (EAGAIN);
 
 	if (cmd == FASTTRAPIOC_MAKEPROBE) {
 		fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg;
 		fasttrap_probe_spec_t *probe;
 		uint64_t noffs;
 		size_t size;
 		int ret, err;
 
 		if (copyin(&uprobe->ftps_noffs, &noffs,
 		    sizeof (uprobe->ftps_noffs)))
 			return (EFAULT);
 
 		/*
 		 * Probes must have at least one tracepoint.
 		 */
 		if (noffs == 0)
 			return (EINVAL);
 
 		/*
 		 * Bound the user-supplied offset count before it is used in
 		 * the size computation, so the multiplication below cannot
 		 * wrap around and sneak past the 1MB limit.  For counts that
 		 * do not overflow this rejects exactly the specs whose total
 		 * size would exceed 1MB.
 		 */
 		if (noffs - 1 >
 		    (1024 * 1024 - sizeof (fasttrap_probe_spec_t)) /
 		    sizeof (probe->ftps_offs[0]))
 			return (ENOMEM);
 
 		size = sizeof (fasttrap_probe_spec_t) +
 		    sizeof (probe->ftps_offs[0]) * (noffs - 1);
 
 		probe = kmem_alloc(size, KM_SLEEP);
 
 		/*
 		 * Re-check the count after the full copy: the user buffer may
 		 * have changed since ftps_noffs was first read.
 		 */
 		if (copyin(uprobe, probe, size) != 0 ||
 		    probe->ftps_noffs != noffs) {
 			kmem_free(probe, size);
 			return (EFAULT);
 		}
 
 		/*
 		 * Verify that the function and module strings contain no
 		 * funny characters.
 		 */
 		if (u8_validate(probe->ftps_func, strlen(probe->ftps_func),
 		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
 			ret = EINVAL;
 			goto err;
 		}
 
 		if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod),
 		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
 			ret = EINVAL;
 			goto err;
 		}
 
 #ifdef notyet
 		if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
 			proc_t *p;
 			pid_t pid = probe->ftps_pid;
 
 			mutex_enter(&pidlock);
 			/*
 			 * Report an error if the process doesn't exist
 			 * or is actively being birthed.
 			 */
 			if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) {
 				mutex_exit(&pidlock);
 				/* Leave through err so probe is released. */
 				ret = ESRCH;
 				goto err;
 			}
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
 
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD | VWRITE)) != 0) {
 				mutex_exit(&p->p_lock);
 				goto err;
 			}
 			mutex_exit(&p->p_lock);
 		}
 #endif /* notyet */
 
 		ret = fasttrap_add_probe(probe);
 err:
 		kmem_free(probe, size);
 
 		return (ret);
 
 	} else if (cmd == FASTTRAPIOC_GETINSTR) {
 		fasttrap_instr_query_t instr;
 		fasttrap_tracepoint_t *tp;
 		uint_t index;
 #ifdef notyet
 		int ret;
 #endif
 
 		/*
 		 * NOTE(review): the copyin below is compiled only for illumos,
 		 * so on other builds instr.ftiq_pid and instr.ftiq_pc are read
 		 * further down without ever being filled in from arg.  Confirm
 		 * how the request is meant to arrive on FreeBSD.
 		 */
 #ifdef illumos
 		if (copyin((void *)arg, &instr, sizeof (instr)) != 0)
 			return (EFAULT);
 #endif
 
 #ifdef notyet
 		if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
 			proc_t *p;
 			pid_t pid = instr.ftiq_pid;
 
 			mutex_enter(&pidlock);
 			/*
 			 * Report an error if the process doesn't exist
 			 * or is actively being birthed.
 			 */
 			if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) {
 				mutex_exit(&pidlock);
 				return (ESRCH);
 			}
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
 
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD)) != 0) {
 				mutex_exit(&p->p_lock);
 				return (ret);
 			}
 
 			mutex_exit(&p->p_lock);
 		}
 #endif /* notyet */
 
 		index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc);
 
 		/*
 		 * Walk the hash chain under the bucket lock looking for a
 		 * tracepoint with the same pid and pc whose process still has
 		 * a non-zero ftpc_acount.
 		 */
 		mutex_enter(&fasttrap_tpoints.fth_table[index].ftb_mtx);
 		tp = fasttrap_tpoints.fth_table[index].ftb_data;
 		while (tp != NULL) {
 			if (instr.ftiq_pid == tp->ftt_pid &&
 			    instr.ftiq_pc == tp->ftt_pc &&
 			    tp->ftt_proc->ftpc_acount != 0)
 				break;
 
 			tp = tp->ftt_next;
 		}
 
 		if (tp == NULL) {
 			mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
 			return (ENOENT);
 		}
 
 		bcopy(&tp->ftt_instr, &instr.ftiq_instr,
 		    sizeof (instr.ftiq_instr));
 		mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
 
 		if (copyout(&instr, (void *)arg, sizeof (instr)) != 0)
 			return (EFAULT);
 
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Bring up the fasttrap provider: create the control device, initialise
  * the global locks, size and allocate the tracepoint, provider and process
  * hash tables, start the cleanup kernel process, install the
  * fork/exec/exit hooks and register as a DTrace meta-provider.
  *
  * Returns 0 on success.  If the cleanup process cannot be created,
  * everything set up so far is torn down again and the kproc_create()
  * error is returned.
  */
 static int
 fasttrap_load(void)
 {
 	ulong_t nent;
 	int i, ret;
 
 	/* Create the /dev/dtrace/fasttrap entry. */
 	fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
 	    "dtrace/fasttrap");
 
 	mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
 	mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
 	    NULL);
 
 #ifdef illumos
 	fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
 #endif
 	fasttrap_total = 0;
 
 	/*
 	 * Conjure up the tracepoints hashtable...
 	 */
 #ifdef illumos
 	nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
 #else
 	nent = tpoints_hash_size;
 #endif
 
 	/* Fall back to the default for a zero or unreasonably large size. */
 	if (nent == 0 || nent > 0x1000000)
 		nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
 
 	tpoints_hash_size = nent;
 
 	/* The bucket count must be a power of two so fth_mask works. */
 	if (ISP2(nent))
 		fasttrap_tpoints.fth_nent = nent;
 	else
 		fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
 	ASSERT(fasttrap_tpoints.fth_nent > 0);
 	fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
 	fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
 #ifndef illumos
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 		mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
 		    "tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
 #endif
 
 	/*
 	 * ... and the providers hash table...
 	 */
 	nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE;
 	if (ISP2(nent))
 		fasttrap_provs.fth_nent = nent;
 	else
 		fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent);
 	ASSERT(fasttrap_provs.fth_nent > 0);
 	fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1;
 	fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
 #ifndef illumos
 	for (i = 0; i < fasttrap_provs.fth_nent; i++)
 		mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx,
 		    "providers bucket mtx", MUTEX_DEFAULT, NULL);
 #endif
 
 	ret = kproc_create(fasttrap_pid_cleanup_cb, NULL,
 	    &fasttrap_cleanup_proc, 0, 0, "ftcleanup");
 	if (ret != 0) {
 		/*
 		 * Undo everything done above.  Only the tracepoints and
 		 * providers tables exist at this point; both must be
 		 * released, together with their bucket mutexes.
 		 */
 		destroy_dev(fasttrap_cdev);
 #ifndef illumos
 		for (i = 0; i < fasttrap_provs.fth_nent; i++)
 			mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
 		for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 			mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
 #endif
 		kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent *
 		    sizeof (fasttrap_bucket_t));
 		kmem_free(fasttrap_tpoints.fth_table,
 		    fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
 		mtx_destroy(&fasttrap_cleanup_mtx);
 		mutex_destroy(&fasttrap_count_mtx);
 		return (ret);
 	}
 
 
 	/*
 	 * ... and the procs hash table.
 	 */
 	nent = FASTTRAP_PROCS_DEFAULT_SIZE;
 	if (ISP2(nent))
 		fasttrap_procs.fth_nent = nent;
 	else
 		fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent);
 	ASSERT(fasttrap_procs.fth_nent > 0);
 	fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1;
 	fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
 #ifndef illumos
 	for (i = 0; i < fasttrap_procs.fth_nent; i++)
 		mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx,
 		    "processes bucket mtx", MUTEX_DEFAULT, NULL);
 
 	rm_init(&fasttrap_tp_lock, "fasttrap tracepoint");
 
 	/*
 	 * This event handler must run before kdtrace_thread_dtor() since it
 	 * accesses the thread's struct kdtrace_thread.
 	 */
 	fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
 	    fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST);
 #endif
 
 	/*
 	 * Install our hooks into fork(2), exec(2), and exit(2).
 	 */
 	dtrace_fasttrap_fork = &fasttrap_fork;
 	dtrace_fasttrap_exit = &fasttrap_exec_exit;
 	dtrace_fasttrap_exec = &fasttrap_exec_exit;
 
 	(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 	    &fasttrap_meta_id);
 
 	return (0);
 }
 
 /*
  * Tear down the fasttrap provider.
  *
  * Unregisters the meta-provider and every provider in fasttrap_provs,
  * removes the fork/exec/exit hooks, drains the cleanup thread and then
  * releases the hash tables, their mutexes and the control device.
  *
  * Returns 0 on success, or -1 when the meta-provider or one of the
  * providers could not be unregistered (the meta-provider is re-registered
  * in the latter case).
  */
 static int
 fasttrap_unload(void)
 {
 	int i, fail = 0;
 
 	/*
 	 * Unregister the meta-provider to make sure no new fasttrap-
 	 * managed providers come along while we're trying to close up
 	 * shop. If we fail to detach, we'll need to re-register as a
 	 * meta-provider. We can fail to unregister as a meta-provider
 	 * if providers we manage still exist.
 	 */
 	if (fasttrap_meta_id != DTRACE_METAPROVNONE &&
 	    dtrace_meta_unregister(fasttrap_meta_id) != 0)
 		return (-1);
 
 	/*
 	 * Iterate over all of our providers. If there's still a process
 	 * that corresponds to that pid, fail to detach.
 	 */
 	for (i = 0; i < fasttrap_provs.fth_nent; i++) {
 		fasttrap_provider_t **fpp, *fp;
 		fasttrap_bucket_t *bucket = &fasttrap_provs.fth_table[i];
 
 		mutex_enter(&bucket->ftb_mtx);
 		/* fpp tracks the link to patch when a provider is unlinked. */
 		fpp = (fasttrap_provider_t **)&bucket->ftb_data;
 		while ((fp = *fpp) != NULL) {
 			/*
 			 * Acquire and release the lock as a simple way of
 			 * waiting for any other consumer to finish with
 			 * this provider. A thread must first acquire the
 			 * bucket lock so there's no chance of another thread
 			 * blocking on the provider's lock.
 			 */
 			mutex_enter(&fp->ftp_mtx);
 			mutex_exit(&fp->ftp_mtx);
 
 			if (dtrace_unregister(fp->ftp_provid) != 0) {
 				/* Still in use: keep it and move on. */
 				fail = 1;
 				fpp = &fp->ftp_next;
 			} else {
 				*fpp = fp->ftp_next;
 				fasttrap_provider_free(fp);
 			}
 		}
 
 		mutex_exit(&bucket->ftb_mtx);
 	}
 
 	if (fail) {
 		(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 		    &fasttrap_meta_id);
 
 		return (-1);
 	}
 
 	/*
 	 * Stop new processes from entering these hooks now, before the
 	 * fasttrap_cleanup thread runs.  That way all processes will hopefully
 	 * be out of these hooks before we free fasttrap_provs.fth_table
 	 */
 	ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
 	dtrace_fasttrap_fork = NULL;
 
 	ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
 	dtrace_fasttrap_exec = NULL;
 
 	ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
 	dtrace_fasttrap_exit = NULL;
 
 	mtx_lock(&fasttrap_cleanup_mtx);
 	fasttrap_cleanup_drain = 1;
 	/* Wait for the cleanup thread to finish up and signal us. */
 	wakeup(&fasttrap_cleanup_cv);
 	mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld",
 	    0);
 	fasttrap_cleanup_proc = NULL;
 	mtx_destroy(&fasttrap_cleanup_mtx);
 
 #ifdef DEBUG
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count == 0);
 	mutex_exit(&fasttrap_count_mtx);
 #endif
 
 #ifndef illumos
 	EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag);
 
 	/* Destroy the per-bucket mutexes before the tables are freed. */
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 		mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
 	for (i = 0; i < fasttrap_provs.fth_nent; i++)
 		mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
 	for (i = 0; i < fasttrap_procs.fth_nent; i++)
 		mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx);
 #endif
 	kmem_free(fasttrap_tpoints.fth_table,
 	    fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_tpoints.fth_nent = 0;
 
 	kmem_free(fasttrap_provs.fth_table,
 	    fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_provs.fth_nent = 0;
 
 	kmem_free(fasttrap_procs.fth_table,
 	    fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_procs.fth_nent = 0;
 
 #ifndef illumos
 	destroy_dev(fasttrap_cdev);
 	mutex_destroy(&fasttrap_count_mtx);
 	rm_destroy(&fasttrap_tp_lock);
 #endif
 
 	return (0);
 }
 
 /*
  * Module event handler.  Load, unload and shutdown events need no work
  * here and simply succeed; any other event type is rejected with
  * EOPNOTSUPP.
  */
 /* ARGSUSED */
 static int
 fasttrap_modevent(module_t mod __unused, int type, void *data __unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 	case MOD_UNLOAD:
 	case MOD_SHUTDOWN:
 		return (0);
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * fasttrap_load() and fasttrap_unload() are hooked into the
  * SI_SUB_DTRACE_PROVIDER subsystem stage via SYSINIT/SYSUNINIT; the module
  * itself is declared with fasttrap_modevent() as its event handler and
  * depends on the dtrace and opensolaris modules.
  */
 SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load,
     NULL);
 SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
     fasttrap_unload, NULL);
 
 DEV_MODULE(fasttrap, fasttrap_modevent, NULL);
 MODULE_VERSION(fasttrap, 1);
 MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1);
 MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1);
Index: projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris
===================================================================
--- projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris	(revision 326162)

Property changes on: projects/bsd_rdma_4_9/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys/cddl/contrib/opensolaris:r326132-326161
Index: projects/bsd_rdma_4_9/sys/compat/linux/linux_emul.c
===================================================================
--- projects/bsd_rdma_4_9/sys/compat/linux/linux_emul.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/compat/linux/linux_emul.c	(revision 326162)
@@ -1,300 +1,300 @@
 /*-
  * Copyright (c) 2006 Roman Divacky
  * Copyright (c) 2013 Dmitry Chagin
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_misc.h>
 #include <compat/linux/linux_persona.h>
 #include <compat/linux/linux_util.h>
 
 
 /*
  * Look up the Linux emulation data attached to a thread.
  *
  * Returns td->td_emuldata as is, which may be NULL when nothing is
  * attached.  Hold PROC_LOCK when referencing emuldata from other threads.
  */
 struct linux_emuldata *
 em_find(struct thread *td)
 {
 
 	return (td->td_emuldata);
 }
 
 /*
  * Look up the per-process Linux emulation data.
  *
  * Returns p->p_emuldata as is, which may be NULL when nothing is attached.
  * Hold PROC_LOCK when referencing proc pemuldata from other threads.
  * Hold LINUX_PEM_LOCK when referencing pemuldata members.
  */
 struct linux_pemuldata *
 pem_find(struct proc *p)
 {
 
 	return (p->p_emuldata);
 }
 
 /*
  * Set up Linux emulation data.
  *
  * When newtd is non-NULL (the non-exec case) a fresh linux_emuldata is
  * allocated for newtd; unless LINUX_CLONE_THREAD is set in flags, a new
  * linux_pemuldata is also allocated and attached to newtd's process.
  *
  * When newtd is NULL (the exec case) the existing emuldata of td is reset
  * and the process' epoll data, if any, is released.
  */
 void
 linux_proc_init(struct thread *td, struct thread *newtd, int flags)
 {
 	struct linux_emuldata *em;
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *emd;
 	struct proc *p;
 
 	if (newtd != NULL) {
 		p = newtd->td_proc;
 
 		/* non-exec call */
 		em = malloc(sizeof(*em), M_TEMP, M_WAITOK | M_ZERO);
 		if (flags & LINUX_CLONE_THREAD) {
 			LINUX_CTR1(proc_init, "thread newtd(%d)",
 			    newtd->td_tid);
 
 			/* A new thread is identified by its thread id. */
 			em->em_tid = newtd->td_tid;
 		} else {
 			LINUX_CTR1(proc_init, "fork newtd(%d)", p->p_pid);
 
 			/* A new process is identified by its pid. */
 			em->em_tid = p->p_pid;
 
 			pem = malloc(sizeof(*pem), M_LINUX, M_WAITOK | M_ZERO);
 			sx_init(&pem->pem_sx, "lpemlk");
 			p->p_emuldata = pem;
 		}
 		newtd->td_emuldata = em;
 	} else {
 		p = td->td_proc;
 
 		/* exec */
 		LINUX_CTR1(proc_init, "exec newtd(%d)", p->p_pid);
 
 		/* lookup the old one */
 		em = em_find(td);
 		KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n"));
 
 		em->em_tid = p->p_pid;
 		em->flags = 0;
 		em->pdeath_signal = 0;
 		em->robust_futexes = NULL;
 		em->child_clear_tid = NULL;
 		em->child_set_tid = NULL;
 
 		/* epoll should be destroyed in a case of exec. */
 		pem = pem_find(p);
 		KASSERT(pem != NULL, ("proc_init: proc emuldata not found.\n"));
 		pem->persona = 0;
 		if (pem->epoll != NULL) {
 			emd = pem->epoll;
 			pem->epoll = NULL;
 			free(emd, M_EPOLL);
 		}
 	}
 
 }
 
 /*
  * Process exit hook.  Does nothing unless the current process runs under
  * the Linux ABI and has per-process emulation data; otherwise detaches the
  * current thread and releases that data, including any epoll state.
  */
 void
 linux_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct thread *td = curthread;
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *epoll;
 
 	if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX))
 		return;
 
 	LINUX_CTR3(proc_exit, "thread(%d) proc(%d) p %p",
 	    td->td_tid, p->p_pid, p);
 
 	pem = pem_find(p);
 	if (pem == NULL)
 		return;
 
 	(p->p_sysent->sv_thread_detach)(td);
 	p->p_emuldata = NULL;
 
 	/* Drop the epoll data first, then the pemuldata that owned it. */
 	epoll = pem->epoll;
 	if (epoll != NULL) {
 		pem->epoll = NULL;
 		free(epoll, M_EPOLL);
 	}
 
 	sx_destroy(&pem->pem_sx);
 	free(pem, M_LINUX);
 }
 
 /*
  * Common execve path for Linux processes: wraps kern_execve() between
  * pre_execve() and post_execve(), and drops the Linux emulation data when
  * the new image no longer runs under the Linux ABI.
  *
  * Returns the pre_execve() or kern_execve() error on failure.  The
  * success value is the subject of the change recorded in this diff (the
  * '-'/'+' lines below): 0 before, EJUSTRETURN after.
  */
 int 
 linux_common_execve(struct thread *td, struct image_args *eargs)
 {
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *emd;
 	struct vmspace *oldvmspace;
 	struct linux_emuldata *em;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 
 	error = kern_execve(td, eargs, NULL);
 	post_execve(td, error, oldvmspace);
-	if (error != 0)
+	if (error != EJUSTRETURN)
 		return (error);
 
 	/*
 	 * In a case of transition from Linux binary execing to
 	 * FreeBSD binary we destroy linux emuldata thread & proc entries.
 	 */
 	if (SV_CURPROC_ABI() != SV_ABI_LINUX) {
 		/* Detach both pointers under the proc lock, free afterwards. */
 		PROC_LOCK(p);
 		em = em_find(td);
 		KASSERT(em != NULL, ("proc_exec: thread emuldata not found.\n"));
 		td->td_emuldata = NULL;
 
 		pem = pem_find(p);
 		KASSERT(pem != NULL, ("proc_exec: proc pemuldata not found.\n"));
 		p->p_emuldata = NULL;
 		PROC_UNLOCK(p);
 
 		if (pem->epoll != NULL) {
 			emd = pem->epoll;
 			pem->epoll = NULL;
 			free(emd, M_EPOLL);
 		}
 
 		/*
 		 * NOTE(review): linux_proc_exit() calls sx_destroy() on
 		 * pem->pem_sx before freeing pem; this path does not --
 		 * confirm whether that is intentional.
 		 */
 		free(em, M_TEMP);
 		free(pem, M_LINUX);
 	}
-	return (0);
+	return (EJUSTRETURN);
 }
 
 /*
  * Exec hook.  When the process being exec'ed is a Linux one, every thread
  * other than the current one is detached; when the new image uses the
  * Linux ABI, the emulation data is (re)initialised via linux_proc_init().
  */
 void 
 linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp)
 {
 	struct thread *td = curthread;
 	struct thread *othertd;
 #if defined(__amd64__)
 	struct linux_pemuldata *pem;
 #endif
 
 	/*
 	 * In a case of execing from linux binary properly detach
 	 * other threads from the user space.
 	 */
 	if (__predict_false(SV_PROC_ABI(p) == SV_ABI_LINUX)) {
 		FOREACH_THREAD_IN_PROC(p, othertd) {
 			if (td != othertd)
 				(p->p_sysent->sv_thread_detach)(othertd);
 		}
 	}
 
 	/*
 	 * In a case of execing to linux binary we create linux
 	 * emuldata thread entry.
 	 */
 	if (__predict_false((imgp->sysent->sv_flags & SV_ABI_MASK) ==
 	    SV_ABI_LINUX)) {
 
 		/*
 		 * Linux -> Linux exec reuses the existing emuldata (newtd is
 		 * NULL); otherwise fresh emuldata is allocated for td.
 		 */
 		if (SV_PROC_ABI(p) == SV_ABI_LINUX)
 			linux_proc_init(td, NULL, 0);
 		else
 			linux_proc_init(td, td, 0);
 #if defined(__amd64__)
 		/*
 		 * An IA32 executable which has executable stack will have the
 		 * READ_IMPLIES_EXEC personality flag set automatically.
 		 */
 		if (SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
 		    imgp->stack_prot & VM_PROT_EXECUTE) {
 			pem = pem_find(p);
 			pem->persona |= LINUX_READ_IMPLIES_EXEC;
 		}
 #endif
 	}
 }
 
 /*
  * Thread destructor hook: detaches and frees the thread's Linux emulation
  * data, if it has any.
  */
 void
 linux_thread_dtor(void *arg __unused, struct thread *td)
 {
 	struct linux_emuldata *em = em_find(td);
 
 	if (em != NULL) {
 		td->td_emuldata = NULL;
 
 		LINUX_CTR1(thread_dtor, "thread(%d)", em->em_tid);
 
 		free(em, M_TEMP);
 	}
 }
 
 void
 linux_schedtail(struct thread *td)
 {
 	struct linux_emuldata *em;
 	struct proc *p;
 	int error = 0;
 	int *child_set_tid;
 
 	p = td->td_proc;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("linux_schedtail: thread emuldata not found.\n"));
 	child_set_tid = em->child_set_tid;
 
 	if (child_set_tid != NULL) {
 		error = copyout(&em->em_tid, child_set_tid,
 		    sizeof(em->em_tid));
 		LINUX_CTR4(schedtail, "thread(%d) %p stored %d error %d",
 		    td->td_tid, child_set_tid, em->em_tid, error);
 	} else
 		LINUX_CTR1(schedtail, "thread(%d)", em->em_tid);
 }
Index: projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.c
===================================================================
--- projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.c	(revision 326162)
@@ -1,3437 +1,3437 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fail.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/mirror/g_mirror.h>
 
 FEATURE(geom_mirror, "GEOM mirroring support");
 
 static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");
 
 /*
  * Tunables exported under the kern.geom.mirror sysctl node.  Each
  * variable's initialiser is its default value.
  */
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0,
     "GEOM_MIRROR stuff");
 /* Debug level compared against the level argument of G_MIRROR_DEBUG(). */
-u_int g_mirror_debug = 0;
-SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0,
+int g_mirror_debug = 0;
+SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0,
     "Debug level");
 static u_int g_mirror_timeout = 4;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout,
     0, "Time to wait on all mirror components");
 static u_int g_mirror_idletime = 5;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_mirror_idletime, 0, "Mark components as clean when idling");
 static u_int g_mirror_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 /* Declared CTLFLAG_RDTUN, unlike the CTLFLAG_RWTUN knobs above. */
 static u_int g_mirror_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");
 
 /*
  * msleep() wrapper that logs, at debug level 4, the calling function and
  * the wait channel both before going to sleep and after waking up.  The
  * return value of msleep() is discarded.
  */
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 static eventhandler_tag g_mirror_post_sync = NULL;
 static int g_mirror_shutdown = 0;
 
 static g_ctl_destroy_geom_t g_mirror_destroy_geom;
 static g_taste_t g_mirror_taste;
 static g_init_t g_mirror_init;
 static g_fini_t g_mirror_fini;
 static g_provgone_t g_mirror_providergone;
 static g_resize_t g_mirror_resize;
 
 /*
  * GEOM class descriptor for the mirror class: binds the class name and
  * version to this file's taste, init/fini, destroy, providergone and
  * resize callbacks, plus the g_mirror_config control-request handler.
  */
 struct g_class g_mirror_class = {
 	.name = G_MIRROR_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_mirror_config,
 	.taste = g_mirror_taste,
 	.destroy_geom = g_mirror_destroy_geom,
 	.init = g_mirror_init,
 	.fini = g_mirror_fini,
 	.providergone = g_mirror_providergone,
 	.resize = g_mirror_resize
 };
 
 
 static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
 static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
 static void g_mirror_update_device(struct g_mirror_softc *sc, bool force);
 static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);
 static void g_mirror_register_request(struct bio *bp);
 static void g_mirror_sync_release(struct g_mirror_softc *sc);
 
 
 /*
  * Map a G_MIRROR_DISK_STATE_* value to a printable name; values that are
  * not recognised yield "INVALID".
  */
 static const char *
 g_mirror_disk_state2str(int state)
 {
 	const char *name;
 
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NONE:
 		name = "NONE";
 		break;
 	case G_MIRROR_DISK_STATE_NEW:
 		name = "NEW";
 		break;
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		name = "ACTIVE";
 		break;
 	case G_MIRROR_DISK_STATE_STALE:
 		name = "STALE";
 		break;
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		name = "SYNCHRONIZING";
 		break;
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		name = "DISCONNECTED";
 		break;
 	case G_MIRROR_DISK_STATE_DESTROY:
 		name = "DESTROY";
 		break;
 	default:
 		name = "INVALID";
 		break;
 	}
 	return (name);
 }
 
 /*
  * Map a G_MIRROR_DEVICE_STATE_* value to a printable name; values that
  * are not recognised yield "INVALID".
  */
 static const char *
 g_mirror_device_state2str(int state)
 {
 
 	if (state == G_MIRROR_DEVICE_STATE_STARTING)
 		return ("STARTING");
 	if (state == G_MIRROR_DEVICE_STATE_RUNNING)
 		return ("RUNNING");
 	return ("INVALID");
 }
 
 /*
  * Return the name stored in the disk, or "[unknown]" when the disk has no
  * consumer or its consumer is not attached to a provider.
  */
 static const char *
 g_mirror_get_diskname(struct g_mirror_disk *disk)
 {
 
 	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
 		return (disk->d_name);
 	return ("[unknown]");
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_mirror are used to maintain disks and device status
  * from one thread to simplify locking.
  */
 /* Release an event structure allocated from M_MIRROR. */
 static void
 g_mirror_event_free(struct g_mirror_event *ep)
 {
 
 	free(ep, M_MIRROR);
 }
 
 /*
  * Queue a state-change event for a device or for one of its disks.
  *
  * arg is a struct g_mirror_softc * when G_MIRROR_EVENT_DEVICE is set in
  * flags, otherwise a struct g_mirror_disk *.  With G_MIRROR_EVENT_DONTWAIT
  * the function returns 0 right after queueing.  Without it, the caller
  * must hold sc_lock exclusively; the lock is dropped while waiting for the
  * event to be flagged G_MIRROR_EVENT_DONE and re-taken before returning
  * the event's e_error.
  */
 int
 g_mirror_event_send(void *arg, int state, int flags)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
 	G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	/* Wake whoever sleeps on sc so the new event gets noticed. */
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 		return (0);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	/*
 	 * PDROP makes msleep() return with sc_events_mtx released, so the
 	 * mutex is re-taken on every iteration; the flag is re-checked at
 	 * least every five seconds.
 	 */
 	while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_mirror_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 /*
  * Return the first event on the device's queue, or NULL if the queue is
  * empty.  The event is left on the queue.
  */
 static struct g_mirror_event *
 g_mirror_event_get(struct g_mirror_softc *sc)
 {
 	struct g_mirror_event *ep;
 
 	mtx_lock(&sc->sc_events_mtx);
 	ep = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 	return (ep);
 }
 
 /*
  * Unlink an event from the device's queue under sc_events_mtx.  The
  * event itself is not freed.
  */
 static void
 g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Drop every queued event that targets the given disk.  Device-level
  * events and events for other disks are left alone.  The caller must hold
  * sc_lock exclusively.
  */
 static void
 g_mirror_event_cancel(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		/*
 		 * Nobody waits for a DONTWAIT event, so it is freed here.
 		 * Otherwise the sender is woken with ECANCELED and frees the
 		 * event itself.
 		 * NOTE(review): G_MIRROR_EVENT_DONE is not set on this path,
 		 * yet the wait loop in g_mirror_event_send() tests that
 		 * flag -- confirm that the waiter terminates.
 		 */
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Count the disks of a mirror that are in the given state.  Passing -1 as
  * state counts every disk on the list.  sc_lock must be held.
  */
 u_int
 g_mirror_ndisks(struct g_mirror_softc *sc, int state)
 {
 	struct g_mirror_disk *disk;
 	u_int count;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	count = 0;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (state != -1 && disk->d_state != state)
 			continue;
 		count++;
 	}
 	return (count);
 }
 
 /*
  * Look up a disk of the mirror by its disk ID.  Returns NULL when no disk
  * on the list carries that ID.  sc_lock must be held exclusively.
  */
 static struct g_mirror_disk *
 g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id)
 {
 	struct g_mirror_disk *disk;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	/* The iterator is left NULL when the list is exhausted. */
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_id == id)
 			break;
 	}
 	return (disk);
 }
 
 /*
  * Count the bios on the device queue whose bio_from is the given
  * consumer.  The queue is walked under sc_queue_mtx.
  */
 static u_int
 g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct bio *bp;
 	u_int count;
 
 	count = 0;
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 		if (bp->bio_from != cp)
 			continue;
 		count++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (count);
 }
 
 /*
  * Report whether a consumer still has I/O attached to it: returns 1 when
  * cp->index is positive or when requests from cp sit in the device queue,
  * and 0 otherwise.
  * NOTE(review): cp->index looks like an in-flight request counter here --
  * confirm against the code that updates it.
  */
 static int
 g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_mirror_nrequests(sc, cp) > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Event callback (posted through g_post_event() by
  * g_mirror_kill_consumer()) that detaches and destroys the consumer passed
  * in arg.  Runs with the topology lock held.
  */
 static void
 g_mirror_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Close and get rid of a consumer.  The consumer's private pointer is
  * always cleared; if the consumer is still busy nothing else happens.
  * Otherwise all access counts are dropped and the consumer is detached and
  * destroyed, either right away or from a posted event (see below).
  * Requires the topology lock.
  */
 static void
 g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_mirror_is_busy(sc, cp))
 		return;
 	pp = cp->provider;
 	retaste_wait = 0;
 	/*
 	 * Deferred destruction is only needed when the consumer is open for
 	 * writing and the provider's geom is not already withering.
 	 */
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After retaste event was send (inside g_access()), we can send
 		 * event to detach and destroy consumer.
 		 * A class, which has consumer to the given provider connected
 		 * will not receive retaste event for the provider.
 		 * This is the way how I ignore retaste events when I close
 		 * consumers opened for write: I detach and destroy consumer
 		 * after retaste event is sent.
 		 */
 		g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Create a consumer for the disk, attach it to provider pp and open it
  * with r1w1e1 access.  On success the consumer is stored in
  * disk->d_consumer and 0 is returned; on failure the consumer is torn
  * down again and the g_attach()/g_access() error is returned.
  *
  * Must be called without the topology lock, which is taken internally.
  */
 static int
 g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	cp->flags |= G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	g_topology_unlock();
 	/* Link consumer and disk to each other and reset cp->index. */
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 
 	G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk));
 	return (0);
 }
 
 /*
  * Dispose of a consumer; a NULL consumer is ignored.  A consumer that is
  * attached to a provider goes through g_mirror_kill_consumer(), an
  * unattached one is destroyed directly.  Requires the topology lock.
  */
 static void
 g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp == NULL)
 		return;
 	if (cp->provider == NULL) {
 		g_destroy_consumer(cp);
 		return;
 	}
 	g_mirror_kill_consumer(sc, cp);
 }
 
 /*
  * Initialize disk. This means allocate memory, create consumer, attach it
  * to the provider and open access (r1w1e1) to it.
  *
  * The remaining disk fields are seeded from the on-disk metadata md.
  * Returns the new disk, or NULL on failure.  When errorp is not NULL it
  * receives 0 on success and the error code (ENOMEM or the
  * g_mirror_connect_disk() error) on failure.
  */
 static struct g_mirror_disk *
 g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md, int *errorp)
 {
 	struct g_mirror_disk *disk;
 	int i, error;
 
 	disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
 	if (disk == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 	disk->d_softc = sc;
 	error = g_mirror_connect_disk(disk, pp);
 	if (error != 0)
 		goto fail;
 	disk->d_id = md->md_did;
 	disk->d_state = G_MIRROR_DISK_STATE_NONE;
 	disk->d_priority = md->md_priority;
 	disk->d_flags = md->md_dflags;
 	/* A failed attribute query just leaves the CANDELETE flag unset. */
 	error = g_getattr("GEOM::candelete", disk->d_consumer, &i);
 	if (error == 0 && i != 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE;
 	/* A non-empty provider name in the metadata marks it hardcoded. */
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_genid = md->md_genid;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	if (errorp != NULL)
 		*errorp = 0;
 	return (disk);
 fail:
 	if (errorp != NULL)
 		*errorp = error;
 	if (disk != NULL)
 		free(disk, M_MIRROR);
 	return (NULL);
 }
 
 /*
  * Unlink `disk' from its device and release it: the disk is removed from
  * the disk list, its pending events are cancelled and the read hint is
  * dropped if it pointed here.  A synchronizing disk has its synchronization
  * stopped first; then the consumer is disconnected and the descriptor freed.
  *
  * Called with sc_lock exclusively held and without the topology lock.
  */
 static void
 g_mirror_destroy_disk(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	LIST_REMOVE(disk, d_next);
 	g_mirror_event_cancel(disk);
 	if (sc->sc_hint == disk)
 		sc->sc_hint = NULL;
 
 	/* Only these states are expected here. */
 	switch (disk->d_state) {
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 	case G_MIRROR_DISK_STATE_NEW:
 	case G_MIRROR_DISK_STATE_STALE:
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		return;
 	}
 
 	if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 		g_mirror_sync_stop(disk, 1);
 
 	g_topology_lock();
 	g_mirror_disconnect_consumer(sc, disk->d_consumer);
 	g_topology_unlock();
 	free(disk, M_MIRROR);
 }
 
 /*
  * Final release of a device softc: destroy its three mutexes and the sx
  * lock, then free the structure itself.  Callers in this file invoke it
  * once sc_refcnt has dropped to zero.
  */
 static void
 g_mirror_free_device(struct g_mirror_softc *sc)
 {
 
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	mtx_destroy(&sc->sc_done_mtx);
 	sx_destroy(&sc->sc_lock);
 	free(sc, M_MIRROR);
 }
 
 /*
  * Provider-gone callback: drop the reference held on the softc stored in
  * pp->private and free the device when that was the last reference.
  */
 static void
 g_mirror_providergone(struct g_provider *pp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = pp->private;
 	sc->sc_refcnt--;
 	if (sc->sc_refcnt == 0)
 		g_mirror_free_device(sc);
 }
 
 /*
  * Tear down a whole mirror device.
  *
  * Must be entered with sc_lock exclusively held and without the topology
  * lock.  Note that sc_lock is released here (and the softc may be freed),
  * so the caller must not touch `sc' or its lock afterwards.
  */
 static void
 g_mirror_destroy_device(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	struct g_geom *gp;
 	struct g_consumer *cp, *tmpcp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_mirror_destroy_provider(sc);
 	/*
 	 * g_mirror_destroy_disk() unlinks the disk from the list, so always
 	 * restart from the head.  Each disk is marked clean on disk first.
 	 */
 	for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
 	    disk = LIST_FIRST(&sc->sc_disks)) {
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 		g_mirror_destroy_disk(disk);
 	}
 	/*
 	 * Drain pending events: fire-and-forget events are freed, events
 	 * somebody waits for are completed with ECANCELED and the waiter is
 	 * woken up.
 	 */
 	while ((ep = g_mirror_event_get(sc)) != NULL) {
 		g_mirror_event_remove(sc, ep);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_MIRROR_EVENT_DONE;
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 
 	g_topology_lock();
 	/* Disconnect whatever consumers remain on the synchronization geom. */
 	LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
 		g_mirror_disconnect_consumer(sc, cp);
 	}
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	sx_xunlock(&sc->sc_lock);
 	/* Drop this function's reference; the last one frees the softc. */
 	if ((--sc->sc_refcnt) == 0)
 		g_mirror_free_device(sc);
 	g_topology_unlock();
 }
 
 /*
  * Orphan callback for a component consumer.  If the consumer still belongs
  * to a disk, request a syncid bump on the device and queue a (non-waiting)
  * event moving the disk to the DISCONNECTED state.  A consumer without a
  * disk is ignored.  Requires the topology lock.
  */
 static void
 g_mirror_orphan(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk != NULL) {
 		disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
 		    G_MIRROR_EVENT_DONTWAIT);
 	}
 }
 
 /*
  * Return the next active disk after `disk' on the device's disk list.
  * The search is cyclic: after the tail it continues from the head, and
  * `disk' itself is examined last, so the same disk is returned when it is
  * the only active one.  If there are no active disks on the list, NULL is
  * returned.
  *
  * `disk' must be a member of sc->sc_disks.
  */
 static __inline struct g_mirror_disk *
 g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 	struct g_mirror_disk *dp;
 
 	/*
 	 * Advance first and compare against the starting disk only after the
 	 * wrap-around has been applied.  Testing `dp != disk' before the wrap
 	 * (as a for-loop condition would) misses the case where wrapping
 	 * lands exactly on `disk', and the walk then never terminates when
 	 * no disk is active.
 	 */
 	dp = disk;
 	do {
 		dp = LIST_NEXT(dp, d_next);
 		if (dp == NULL)
 			dp = LIST_FIRST(&sc->sc_disks);
 		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			return (dp);
 	} while (dp != disk);
 	return (NULL);
 }
 
 /*
  * Pick the disk to use for the next request and advance the device's hint.
  * The hinted disk is used when it is active, otherwise the next active one
  * after it.  Afterwards sc_hint is moved to the active disk following the
  * chosen one.  Returns NULL when the list is empty or holds no active disk.
  */
 static struct g_mirror_disk *
 g_mirror_get_disk(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *cur;
 
 	cur = sc->sc_hint;
 	if (cur == NULL) {
 		/* No hint yet: start from the head of the list. */
 		cur = LIST_FIRST(&sc->sc_disks);
 		sc->sc_hint = cur;
 		if (cur == NULL)
 			return (NULL);
 	}
 	if (cur->d_state != G_MIRROR_DISK_STATE_ACTIVE) {
 		cur = g_mirror_find_next(sc, cur);
 		if (cur == NULL)
 			return (NULL);
 	}
 	sc->sc_hint = g_mirror_find_next(sc, cur);
 	return (cur);
 }
 
 /*
  * Write one sector of metadata to the last sector of the disk's provider.
  *
  * The sector buffer is allocated zeroed.  When `md' is not NULL and the
  * device is not flagged WIPE, `md' is encoded into it; otherwise the zeroed
  * sector is written as is.  ENOSPC is returned, and nothing is written,
  * when the metadata sector offset is below md->md_mediasize.
  *
  * On any error the disk is flagged BROKEN and, if disconnect-on-failure is
  * enabled and more than one disk is active, a genid bump is requested and a
  * DISCONNECTED event is queued for the disk.
  *
  * Called with sc_lock held and without the topology lock.  Returns 0 or an
  * errno value.
  */
 static int
 g_mirror_write_metadata(struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	/* The metadata occupies the very last sector of the provider. */
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO);
 	if (md != NULL &&
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) {
 		/*
 		 * Handle the case, when the size of parent provider reduced.
 		 */
 		if (offset < md->md_mediasize)
 			error = ENOSPC;
 		else
 			mirror_metadata_encode(md, sector);
 	}
 	/* Fail point hook; it may set `error'. */
 	KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error);
 	if (error == 0)
 		error = g_write_data(cp, offset, sector, length);
 	free(sector, M_MIRROR);
 	if (error != 0) {
 		/* Log at level 0 only the first time the disk breaks. */
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 			G_MIRROR_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		} else {
 			G_MIRROR_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		}
 		/* Never disconnect the last active disk. */
 		if (g_mirror_disconnect_on_failure &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 /*
  * Erase the metadata sector of `disk' by writing it without any metadata.
  * Nothing is done (and 0 is returned) for devices that are not of the
  * AUTOMATIC type.  Called with sc_lock held and without the topology lock.
  * Returns 0 or the error from g_mirror_write_metadata().
  */
 static int
 g_mirror_clear_metadata(struct g_mirror_disk *disk)
 {
 	int rc;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
 		return (0);
 
 	rc = g_mirror_write_metadata(disk, NULL);
 	if (rc != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), rc);
 		return (rc);
 	}
 	G_MIRROR_DEBUG(2, "Metadata on %s cleared.",
 	    g_mirror_get_diskname(disk));
 	return (0);
 }
 
 /*
  * Fill `md' from the current state of device `sc' and, when given, of
  * `disk'.  With disk == NULL the per-disk fields describe a brand new
  * component: a random disk id and zeroed priority, syncid, flags, sync
  * offset and provider size.
  */
 void
 g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 	struct g_provider *pp;
 
 	/* Device-wide fields. */
 	strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_MIRROR_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_mid = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_slice = sc->sc_slice;
 	md->md_balance = sc->sc_balance;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK);
 	bzero(md->md_provider, sizeof(md->md_provider));
 
 	if (disk == NULL) {
 		/* Per-disk fields for a component that does not exist yet. */
 		md->md_did = arc4random();
 		md->md_priority = 0;
 		md->md_syncid = 0;
 		md->md_dflags = 0;
 		md->md_sync_offset = 0;
 		md->md_provsize = 0;
 		return;
 	}
 
 	/* Per-disk fields of an existing component. */
 	md->md_did = disk->d_id;
 	md->md_priority = disk->d_priority;
 	md->md_syncid = disk->d_sync.ds_syncid;
 	md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK);
 	/* The sync offset is recorded only while synchronizing. */
 	if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 		md->md_sync_offset = 0;
 	else
 		md->md_sync_offset = disk->d_sync.ds_offset_done;
 	pp = disk->d_consumer->provider;
 	if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0)
 		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
 	md->md_provsize = pp->mediasize;
 }
 
 /*
  * Regenerate and write the metadata of `disk'.  Only AUTOMATIC devices
  * carry metadata; for other types this is a no-op.  When the device is
  * flagged WIPE the metadata structure is not filled in.  Failures are
  * logged but not reported to the caller.
  *
  * Called with sc_lock held and without the topology lock.
  */
 void
 g_mirror_update_metadata(struct g_mirror_disk *disk)
 {
 	struct g_mirror_metadata md;
 	struct g_mirror_softc *sc;
 	int rc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
 		return;
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0)
 		g_mirror_fill_metadata(sc, disk, &md);
 	rc = g_mirror_write_metadata(disk, &md);
 	if (rc != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), rc);
 		return;
 	}
 	G_MIRROR_DEBUG(2, "Metadata on %s updated.",
 	    g_mirror_get_diskname(disk));
 }
 
 /*
  * Increment the device's syncid and propagate the new value, together with
  * a metadata update, to every ACTIVE or SYNCHRONIZING disk.
  *
  * Called with sc_lock exclusively held and without the topology lock; at
  * least one disk must be active.
  */
 static void
 g_mirror_bump_syncid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *dp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid += 1;
 	G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE &&
 		    dp->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		dp->d_sync.ds_syncid = sc->sc_syncid;
 		g_mirror_update_metadata(dp);
 	}
 }
 
 /*
  * Increment the device's genid and propagate the new value, together with
  * a metadata update, to every ACTIVE or SYNCHRONIZING disk.
  *
  * Called with sc_lock exclusively held and without the topology lock; at
  * least one disk must be active.
  */
 static void
 g_mirror_bump_genid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *dp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid += 1;
 	G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE &&
 		    dp->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		dp->d_genid = sc->sc_genid;
 		g_mirror_update_metadata(dp);
 	}
 }
 
 /*
  * Try to mark the device idle, i.e. clear the DIRTY flag on every active
  * disk and write that to the metadata.
  *
  * Nothing is done when there is no provider, the NOFAILSYNC flag is set,
  * the device is already idle or writes are still outstanding.  When `acw'
  * is positive, or is -1 while the provider is open for writing, the device
  * is left alone until g_mirror_idletime has elapsed since the last write
  * (unless the system is shutting down); in that case the remaining time is
  * returned.  Otherwise 0 is returned.
  *
  * Called with sc_lock exclusively held and without the topology lock.
  */
 static int
 g_mirror_idle(struct g_mirror_softc *sc, int acw)
 {
 	struct g_mirror_disk *dp;
 	int remaining;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider == NULL ||
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0 ||
 	    sc->sc_idle || sc->sc_writes > 0)
 		return (0);
 
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		remaining = g_mirror_idletime -
 		    (time_uptime - sc->sc_last_write);
 		if (remaining > 0 && !g_mirror_shutdown)
 			return (remaining);
 	}
 
 	sc->sc_idle = 1;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) {
 			G_MIRROR_DEBUG(2,
 			    "Disk %s (device %s) marked as clean.",
 			    g_mirror_get_diskname(dp), sc->sc_name);
 			dp->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 			g_mirror_update_metadata(dp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Leave the idle state: record the current uptime as the time of the last
  * write, then set the DIRTY flag on every active disk and write it to the
  * metadata.  Does nothing for devices with the NOFAILSYNC flag.
  *
  * Called with sc_lock exclusively held and without the topology lock.
  */
 static void
 g_mirror_unidle(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *dp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) {
 			G_MIRROR_DEBUG(2,
 			    "Disk %s (device %s) marked as dirty.",
 			    g_mirror_get_diskname(dp), sc->sc_name);
 			dp->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 			g_mirror_update_metadata(dp);
 		}
 	}
 }
 
 /*
  * bio_done callback for the per-disk clones created by g_mirror_flush().
  * Under sc_done_mtx the first error seen and the completed byte count are
  * folded into the parent; the parent is delivered once every child has
  * reported back.  The child bio is always destroyed.
  */
 static void
 g_mirror_flush_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct bio *parent;
 	int last;
 
 	parent = bp->bio_parent;
 	sc = parent->bio_to->private;
 
 	mtx_lock(&sc->sc_done_mtx);
 	if (parent->bio_error == 0)
 		parent->bio_error = bp->bio_error;
 	parent->bio_completed += bp->bio_completed;
 	parent->bio_inbed++;
 	last = (parent->bio_children == parent->bio_inbed);
 	mtx_unlock(&sc->sc_done_mtx);
 
 	/* Deliver outside of the mutex, and only for the final child. */
 	if (last)
 		g_io_deliver(parent, parent->bio_error);
 	g_destroy_bio(bp);
 }
 
 /*
  * bio_done callback for clones of regular requests (installed by the
  * g_mirror_request_*() functions).  The finished bio is tagged as a regular
  * request, appended to the device queue and whoever sleeps on `sc' is
  * woken up.
  */
 static void
 g_mirror_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 /*
  * Process a completed clone `bp' of a regular (non-synchronization)
  * request and, when appropriate, complete its parent.
  *
  * Bookkeeping: the consumer's in-flight counter (index) and, for writes,
  * sc_writes are decremented.  A consumer that no longer belongs to a disk
  * is killed.
  *
  * On success the parent is delivered once all children are in.  On failure
  * the disk is flagged BROKEN and may be scheduled for disconnection.  A
  * failed READ is either delivered with the error (single active disk) or
  * the parent is put back on the device queue with its error cleared.  For
  * WRITE/DELETE a failed child is removed from the parent's child counts, so
  * the parent succeeds as long as at least one child succeeded.
  *
  * Must be called without the topology lock.
  */
 static void
 g_mirror_regular_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->private;
 	bp->bio_from->index--;
 	if (bp->bio_cmd == BIO_WRITE)
 		sc->sc_writes--;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		/* The consumer is no longer associated with a disk. */
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 	}
 
 	/* Fail point hooks; they may set bp->bio_error. */
 	if (bp->bio_cmd == BIO_READ)
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read,
 		    bp->bio_error);
 	else if (bp->bio_cmd == BIO_WRITE)
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write,
 		    bp->bio_error);
 
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	if (bp->bio_error == 0 && pbp->bio_error == 0) {
 		/* Fast path: neither this child nor any earlier one failed. */
 		G_MIRROR_LOGREQ(3, bp, "Request delivered.");
 		g_destroy_bio(bp);
 		if (pbp->bio_children == pbp->bio_inbed) {
 			G_MIRROR_LOGREQ(3, pbp, "Request delivered.");
 			pbp->bio_completed = pbp->bio_length;
 			if (pbp->bio_cmd == BIO_WRITE ||
 			    pbp->bio_cmd == BIO_DELETE) {
 				bioq_remove(&sc->sc_inflight, pbp);
 				/* Release delayed sync requests if possible. */
 				g_mirror_sync_release(sc);
 			}
 			g_io_deliver(pbp, pbp->bio_error);
 		}
 		return;
 	} else if (bp->bio_error != 0) {
 		/* Remember the first error in the parent. */
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		if (disk != NULL) {
 			/* Log at level 0 only the first time the disk breaks. */
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 				disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 				G_MIRROR_LOGREQ(0, bp,
 				    "Request failed (error=%d).",
 				    bp->bio_error);
 			} else {
 				G_MIRROR_LOGREQ(1, bp,
 				    "Request failed (error=%d).",
 				    bp->bio_error);
 			}
 			/* Never disconnect the last active disk. */
 			if (g_mirror_disconnect_on_failure &&
 			    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1)
 			{
 				/*
 				 * ENXIO requests a syncid bump (the _NOW
 				 * variant for non-read commands); any other
 				 * error requests a genid bump.
 				 */
 				if (bp->bio_error == ENXIO &&
 				    bp->bio_cmd == BIO_READ)
 					sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 				else if (bp->bio_error == ENXIO)
 					sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW;
 				else
 					sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 				g_mirror_event_send(disk,
 				    G_MIRROR_DISK_STATE_DISCONNECTED,
 				    G_MIRROR_EVENT_DONTWAIT);
 			}
 		}
 		/*
 		 * For writes and deletes a failed child is taken out of the
 		 * parent's accounting altogether, so bio_children ends up
 		 * counting only children that did not fail.
 		 */
 		switch (pbp->bio_cmd) {
 		case BIO_DELETE:
 		case BIO_WRITE:
 			pbp->bio_inbed--;
 			pbp->bio_children--;
 			break;
 		}
 	}
 	g_destroy_bio(bp);
 
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (pbp->bio_inbed < pbp->bio_children)
 			break;
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1)
 			g_io_deliver(pbp, pbp->bio_error);
 		else {
 			/*
 			 * Clear the error and put the parent back on the
 			 * device queue (presumably so it is retried).
 			 */
 			pbp->bio_error = 0;
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_tail(&sc->sc_queue, pbp);
 			mtx_unlock(&sc->sc_queue_mtx);
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 			wakeup(sc);
 		}
 		break;
 	case BIO_DELETE:
 	case BIO_WRITE:
 		if (pbp->bio_children == 0) {
 			/*
 			 * All requests failed.
 			 */
 		} else if (pbp->bio_inbed < pbp->bio_children) {
 			/* Do nothing. */
 			break;
 		} else if (pbp->bio_children == pbp->bio_inbed) {
 			/* Some requests succeeded. */
 			pbp->bio_error = 0;
 			pbp->bio_completed = pbp->bio_length;
 		}
 		bioq_remove(&sc->sc_inflight, pbp);
 		/* Release delayed sync requests if possible. */
 		g_mirror_sync_release(sc);
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd));
 		break;
 	}
 }
 
 /*
  * bio_done callback for synchronization requests (installed by
  * g_mirror_sync_request()).  The finished bio is tagged as a
  * synchronization request, appended to the device queue and whoever sleeps
  * on `sc' is woken up.
  */
 static void
 g_mirror_sync_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 /*
  * Answer a "GEOM::candelete" attribute request: store 1 in the request's
  * data buffer if at least one disk of the mirror carries the CANDELETE
  * flag, 0 otherwise, and complete the request successfully.
  */
 static void
 g_mirror_candelete(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *dp;
 	int supported;
 
 	sc = bp->bio_to->private;
 	supported = 0;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) {
 			supported = 1;
 			break;
 		}
 	}
 	*(int *)bp->bio_data = supported;
 	g_io_deliver(bp, 0);
 }
 
 /*
  * Handle a "GEOM::kerneldump" attribute request by forwarding a clone of
  * it to the first component on the disk list, after clamping the requested
  * length to the mirror provider's media size.  ENOMEM is delivered when
  * the request cannot be cloned.
  */
 static void
 g_mirror_kernel_dump(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *first;
 	struct g_kerneldump *dump;
 	struct bio *clone;
 
 	/*
 	 * We configure dumping to the first component, because this component
 	 * will be used for reading with 'prefer' balance algorithm.
 	 * If the component with the highest priority is currently disconnected
 	 * we will not be able to read the dump after the reboot if it will be
 	 * connected and synchronized later. Can we do something better?
 	 */
 	sc = bp->bio_to->private;
 	first = LIST_FIRST(&sc->sc_disks);
 
 	dump = (struct g_kerneldump *)bp->bio_data;
 	if (dump->length > bp->bio_to->mediasize)
 		dump->length = bp->bio_to->mediasize;
 
 	clone = g_clone_bio(bp);
 	if (clone != NULL) {
 		clone->bio_done = g_std_done;
 		g_io_request(clone, first->d_consumer);
 		G_MIRROR_DEBUG(1, "Kernel dump will go to %s.",
 		    g_mirror_get_diskname(first));
 	} else
 		g_io_deliver(bp, ENOMEM);
 }
 
 static void
 g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	bioq_init(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_mirror_flush_done;
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_mirror_start(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->private;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_mirror_start() should not be called at all.
 	 */
 	KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Provider's error should be set (error=%d)(mirror=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_MIRROR_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	case BIO_FLUSH:
 		g_mirror_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 		if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
 			g_mirror_candelete(bp);
 			return;
 		} else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
 			g_mirror_kernel_dump(bp);
 			return;
 		}
 		/* FALLTHROUGH */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	if (bp->bio_to->error != 0) {
 		mtx_unlock(&sc->sc_queue_mtx);
 		g_io_deliver(bp, bp->bio_to->error);
 		return;
 	}
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE if the given request overlaps an in-progress synchronization
  * request of any disk that is currently being synchronized.
  */
 static int
 g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *dp;
 	struct bio *syncbp;
 	off_t req_start, req_end, sync_start, sync_end;
 	u_int slot;
 
 	/* Nothing is being synchronized, so nothing can collide. */
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (0);
 
 	req_start = bp->bio_offset;
 	req_end = bp->bio_offset + bp->bio_length;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		for (slot = 0; slot < g_mirror_syncreqs; slot++) {
 			syncbp = dp->d_sync.ds_bios[slot];
 			if (syncbp == NULL)
 				continue;
 			sync_start = syncbp->bio_offset;
 			sync_end = syncbp->bio_offset + syncbp->bio_length;
 			/* Half-open ranges [start, end) overlap. */
 			if (req_start < sync_end && sync_start < req_end)
 				return (1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Return TRUE if the given synchronization request overlaps an in-progress
  * regular request, i.e. one that sits on the device's in-flight queue.
  */
 static int
 g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp)
 {
 	struct bio *cur;
 	off_t req_start, req_end, sync_start, sync_end;
 
 	/* Nothing is being synchronized, so nothing can collide. */
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (0);
 
 	sync_start = sbp->bio_offset;
 	sync_end = sbp->bio_offset + sbp->bio_length;
 	TAILQ_FOREACH(cur, &sc->sc_inflight.queue, bio_queue) {
 		req_start = cur->bio_offset;
 		req_end = cur->bio_offset + cur->bio_length;
 		/* Half-open ranges [start, end) overlap. */
 		if (req_start < sync_end && sync_start < req_end)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Puts a regular request onto the delayed queue.
  *
  * Note that the request is inserted at the head of sc_regular_delayed.
  * g_mirror_regular_release() walks that queue from the head and re-inserts
  * each released request at the head of sc_queue, which reverses the order
  * once more.
  */
 static void
 g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying request.");
 	bioq_insert_head(&sc->sc_regular_delayed, bp);
 }
 
 /*
  * Puts a synchronization request onto the tail of the delayed queue
  * (sc_sync_delayed), from which g_mirror_sync_release() later re-issues it.
  */
 static void
 g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request.");
 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
 }
 
 /*
  * Releases delayed regular requests which no longer collide with any
  * synchronization request: each such request is moved from the delayed
  * queue to the head of the device queue.
  */
 static void
 g_mirror_regular_release(struct g_mirror_softc *sc)
 {
 	struct bio *cur, *next;
 
 	TAILQ_FOREACH_SAFE(cur, &sc->sc_regular_delayed.queue, bio_queue,
 	    next) {
 		if (!g_mirror_sync_collision(sc, cur)) {
 			bioq_remove(&sc->sc_regular_delayed, cur);
 			G_MIRROR_LOGREQ(2, cur,
 			    "Releasing delayed request (%p).", cur);
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_head(&sc->sc_queue, cur);
 			mtx_unlock(&sc->sc_queue_mtx);
 		}
 	}
 }
 
 /*
  * Releases delayed synchronization requests which no longer collide with
  * any regular request: each such request is taken off the delayed queue
  * and issued to the consumer recorded in its bio_from.
  */
 static void
 g_mirror_sync_release(struct g_mirror_softc *sc)
 {
 	struct bio *cur, *next;
 
 	TAILQ_FOREACH_SAFE(cur, &sc->sc_sync_delayed.queue, bio_queue, next) {
 		if (!g_mirror_regular_collision(sc, cur)) {
 			bioq_remove(&sc->sc_sync_delayed, cur);
 			G_MIRROR_LOGREQ(2, cur,
 			    "Releasing delayed synchronization request.");
 			g_io_request(cur, cur->bio_from);
 		}
 	}
 }
 
 /*
  * Free a synchronization request together with its data buffer.  When the
  * owning disk is known and still has its array of sync bios, the slot of
  * this request (its index is kept in bio_caller1) is cleared as well.
  */
 static void
 g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp)
 {
 	struct bio **slots;
 	int idx;
 
 	slots = NULL;
 	if (disk != NULL)
 		slots = disk->d_sync.ds_bios;
 	if (slots != NULL) {
 		idx = (int)(uintptr_t)bp->bio_caller1;
 		KASSERT(slots[idx] == bp,
 		    ("unexpected sync BIO at %p:%d", disk, idx));
 		slots[idx] = NULL;
 	}
 	free(bp->bio_data, M_MIRROR);
 	g_destroy_bio(bp);
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is a two-step process: first, a READ
  * request is sent to the mirror provider and then a WRITE request (with the
  * data just read) is sent to the disk being synchronized.  When the WRITE
  * has finished, the same bio is reused for the next synchronization request.
  *
  * sc_lock is dropped and re-taken on the path that handles a consumer which
  * no longer belongs to a disk, so the caller evidently holds it exclusively
  * on entry.
  */
 static void
 g_mirror_sync_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct g_mirror_disk_sync *sync;
 
 	bp->bio_from->index--;
 	sc = bp->bio_from->geom->softc;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		/* The consumer lost its disk; kill it and drop the request. */
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		g_mirror_sync_request_free(NULL, bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	    {
 		struct g_consumer *cp;
 
 		/* Fail point hook; it may set bp->bio_error. */
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read,
 		    bp->bio_error);
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_mirror_sync_request_free(disk, bp);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp,
 		    "Synchronization request half-finished.");
 		/* Second step: write the data just read to the disk. */
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	    }
 	case BIO_WRITE:
 	    {
 		off_t offset;
 		void *data;
 		int i, idx;
 
 		/* Fail point hook; it may set bp->bio_error. */
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write,
 		    bp->bio_error);
 
 		if (bp->bio_error != 0) {
 			/* A failed write disconnects the disk. */
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_mirror_sync_request_free(disk, bp);
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp, "Synchronization request finished.");
 		sync = &disk->d_sync;
 		if (sync->ds_offset >= sc->sc_mediasize ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			g_mirror_sync_request_free(disk, bp);
 			/* Wait until the remaining requests have finished. */
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/* Disk up-to-date, activate it. */
 			g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		/* The data buffer and slot index are carried across the reset. */
 		data = bp->bio_data;
 		idx = (int)(uintptr_t)bp->bio_caller1;
 		g_reset_bio(bp);
 		bp->bio_cmd = BIO_READ;
 		bp->bio_offset = sync->ds_offset;
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		sync->ds_offset += bp->bio_length;
 		bp->bio_done = g_mirror_sync_done;
 		bp->bio_data = data;
 		bp->bio_from = sync->ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		bp->bio_caller1 = (void *)(uintptr_t)idx;
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Release delayed requests if possible. */
 		g_mirror_regular_release(sc);
 
 		/* Find the smallest offset among the outstanding sync bios. */
 		offset = sc->sc_mediasize;
 		for (i = 0; i < g_mirror_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			if (bp != NULL && bp->bio_offset < offset)
 				offset = bp->bio_offset;
 		}
 		if (sync->ds_offset_done + (MAXPHYS * 100) < offset) {
 			/* Update offset_done on every 100 blocks. */
 			sync->ds_offset_done = offset;
 			g_mirror_update_metadata(disk);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static void
 g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 static void
 g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	disk = g_mirror_get_disk(sc);
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 #define TRACK_SIZE  (1 * 1024 * 1024)
 #define LOAD_SCALE	256
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 
 static void
 g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk, *dp;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	int prio, best;
 
 	/* Find a disk with the smallest load. */
 	disk = NULL;
 	best = INT_MAX;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		prio = dp->load;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (dp->d_last_offset == bp->bio_offset)
 			prio -= 2 * LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE)
 			prio -= 1 * LOAD_SCALE;
 		if (prio <= best) {
 			disk = dp;
 			best = prio;
 		}
 	}
 	KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	/* Remember last head position */
 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	/* Update loads. */
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		dp->load = (dp->d_consumer->index * LOAD_SCALE +
 		    dp->load * 7) / 8;
 	}
 	g_io_request(cbp, cp);
 }
 
 /*
  * 'split' balance algorithm: spread one request over the active disks.
  *
  * Requests no longer than sc_slice are handed to the round-robin algorithm
  * unchanged.  Larger ones are cut into consecutive pieces of bio_length /
  * (number of active disks) bytes, rounded up to a multiple of the mirror
  * provider's sector size, and each piece goes to a different active disk.
  * The request is failed with ENOMEM when a clone cannot be allocated (an
  * error already set on the request takes precedence).
  */
 static void
 g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	off_t left, mod, offset, slice;
 	u_char *data;
 	u_int ndisks;
 
 	if (bp->bio_length <= sc->sc_slice) {
 		g_mirror_request_round_robin(sc, bp);
 		return;
 	}
 	ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE);
 	slice = bp->bio_length / ndisks;
 	/* Round the slice up to a whole number of sectors. */
 	mod = slice % sc->sc_provider->sectorsize;
 	if (mod != 0)
 		slice += sc->sc_provider->sectorsize - mod;
 	/*
 	 * Allocate all bios before sending any request, so we can
 	 * return ENOMEM in nice and clean way.
 	 */
 	left = bp->bio_length;
 	offset = bp->bio_offset;
 	data = bp->bio_data;
 	bioq_init(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_mirror_done;
 		/* Remember the target disk until the clone is dispatched. */
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 		cbp->bio_offset = offset;
 		cbp->bio_data = data;
 		/* The last piece may be shorter than a full slice. */
 		cbp->bio_length = MIN(left, slice);
 		left -= cbp->bio_length;
 		if (left == 0)
 			break;
 		offset += cbp->bio_length;
 		data += cbp->bio_length;
 	}
 	/* Every clone is allocated; now dispatch them. */
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		disk->d_consumer->index++;
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 /*
  * Dispatch a request to the mirror's components.  The softc is taken from
  * bp->bio_to->private.
  *
  * BIO_READ is handed to the handler that matches sc->sc_balance.
  *
  * BIO_WRITE and BIO_DELETE are first checked against in-progress
  * synchronization requests and delayed on collision.  Otherwise the request
  * is cloned for every ACTIVE component and for every SYNCHRONIZING component
  * whose ds_offset is greater than the request's start offset.  BIO_DELETE
  * additionally skips components without G_MIRROR_DISK_FLAG_CANDELETE.  If no
  * clone was created the request completes with EOPNOTSUPP; if a clone cannot
  * be allocated it completes with ENOMEM (or the error bp already carries).
  * After the clones are sent, bp is appended to sc->sc_inflight.
  */
 static void
 g_mirror_register_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->private;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		/* No default case: an unknown balance value sends nothing. */
 		switch (sc->sc_balance) {
 		case G_MIRROR_BALANCE_LOAD:
 			g_mirror_request_load(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_PREFER:
 			g_mirror_request_prefer(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_ROUND_ROBIN:
 			g_mirror_request_round_robin(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_SPLIT:
 			g_mirror_request_split(sc, bp);
 			break;
 		}
 		return;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	    {
 		struct g_mirror_disk *disk;
 		struct g_mirror_disk_sync *sync;
 		struct bio_queue_head queue;
 		struct g_consumer *cp;
 		struct bio *cbp;
 
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_mirror_sync_collision(sc, bp)) {
 			g_mirror_regular_delay(sc, bp);
 			return;
 		}
 
 		/* Leave the idle state, or just record the time of this write. */
 		if (sc->sc_idle)
 			g_mirror_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 			g_mirror_bump_syncid(sc);
 		}
 
 		/*
 		 * Allocate all bios before sending any request, so we can
 		 * return ENOMEM in nice and clean way.
 		 */
 		bioq_init(&queue);
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			sync = &disk->d_sync;
 			switch (disk->d_state) {
 			case G_MIRROR_DISK_STATE_ACTIVE:
 				break;
 			case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 				/*
 				 * Skip the component when the request starts
 				 * at or beyond its ds_offset.
 				 */
 				if (bp->bio_offset >= sync->ds_offset)
 					continue;
 				break;
 			default:
 				continue;
 			}
 			if (bp->bio_cmd == BIO_DELETE &&
 			    (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0)
 				continue;
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				/* Undo the clones made so far and fail bp. */
 				while ((cbp = bioq_takefirst(&queue)) != NULL)
 					g_destroy_bio(cbp);
 				if (bp->bio_error == 0)
 					bp->bio_error = ENOMEM;
 				g_io_deliver(bp, bp->bio_error);
 				return;
 			}
 			bioq_insert_tail(&queue, cbp);
 			cbp->bio_done = g_mirror_done;
 			cp = disk->d_consumer;
 			/* Stash the consumer until the clone is dispatched. */
 			cbp->bio_caller1 = cp;
 			cbp->bio_to = cp->provider;
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 		}
 		/* No component accepted the request. */
 		if (bioq_first(&queue) == NULL) {
 			g_io_deliver(bp, EOPNOTSUPP);
 			return;
 		}
 		while ((cbp = bioq_takefirst(&queue)) != NULL) {
 			G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 			cp = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 			cp->index++;
 			sc->sc_writes++;
 			g_io_request(cbp, cp);
 		}
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		bioq_insert_tail(&sc->sc_inflight, bp);
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 /*
  * Report whether the device can be torn down right now.
  *
  * Returns 1 when the main geom no longer has a softc, or when the device is
  * not flagged as TASTING and g_mirror_is_busy() reports no busy consumer on
  * the main geom or on the synchronization geom.  Returns 0 otherwise.
  * Asserts the topology lock via g_topology_assert().
  */
 static int
 g_mirror_can_destroy(struct g_mirror_softc *sc)
 {
 	struct g_consumer *cons;
 	struct g_geom *geom;
 	int pass;
 
 	g_topology_assert();
 
 	geom = sc->sc_geom;
 	if (geom->softc == NULL)
 		return (1);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0)
 		return (0);
 
 	/* Pass 0 scans the main geom, pass 1 the synchronization geom. */
 	for (pass = 0; pass < 2; pass++) {
 		if (pass == 1)
 			geom = sc->sc_sync.ds_geom;
 		LIST_FOREACH(cons, &geom->consumer, consumer) {
 			if (g_mirror_is_busy(sc, cons))
 				return (0);
 		}
 	}
 
 	G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 /*
  * Try to destroy the device.
  *
  * Any root mount hold is released first.  If g_mirror_can_destroy() refuses,
  * 0 is returned and nothing else changes.  Otherwise both geoms are detached
  * from the softc and 1 is returned after either:
  *  - G_MIRROR_DEVICE_FLAG_DRAIN set: sc_lock is released and a wakeup is
  *    issued on &sc->sc_worker (teardown is left to whoever sleeps on it), or
  *  - flag clear: g_mirror_destroy_device() is called directly.
  */
 static int
 g_mirror_try_destroy(struct g_mirror_softc *sc)
 {
 
 	if (sc->sc_rootmount != NULL) {
 		G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 	g_topology_lock();
 	if (!g_mirror_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	/* A NULL geom softc marks the device as being torn down. */
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) {
 		g_topology_unlock();
 		G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 		    &sc->sc_worker);
 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 		sx_xunlock(&sc->sc_lock);
 		wakeup(&sc->sc_worker);
 		/*
 		 * NOTE(review): sc is written after the wakeup; presumably the
 		 * waiter polls sc_worker for NULL before freeing sc - confirm.
 		 */
 		sc->sc_worker = NULL;
 	} else {
 		g_topology_unlock();
 		g_mirror_destroy_device(sc);
 	}
 	return (1);
 }
 
 /*
  * Worker thread.
  *
  * 'arg' is the device softc.  The loop runs with sc->sc_lock held
  * exclusively and, on every iteration:
  *  1. handles one pending event, if any, and restarts the loop;
  *  2. otherwise takes the first bio from sc->sc_queue and routes it to the
  *     synchronization, regular-completion or register handler;
  *  3. with an empty queue, drops sc_lock and sleeps on sc for up to the
  *     timeout returned by g_mirror_idle().
  * When G_MIRROR_DEVICE_FLAG_DESTROY is set and g_mirror_try_destroy()
  * succeeds, the thread exits via kproc_exit().
  */
 static void
 g_mirror_worker(void *arg)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	/* Run this thread at PRIBIO priority. */
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_mirror_event_get(sc);
 		if (ep != NULL) {
 			g_mirror_event_remove(sc, ep);
 			if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_MIRROR_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_mirror_update_device(sc, true);
 			} else {
 				/* Update disk status. */
 				G_MIRROR_DEBUG(3, "Running event for disk %s.",
 				     g_mirror_get_diskname(ep->e_disk));
 				ep->e_error = g_mirror_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_mirror_update_device(sc, false);
 			}
 			if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
 				/* Nobody waits for the result; free it here. */
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_mirror_event_free(ep);
 			} else {
 				/* Mark the event done and wake its sender. */
 				ep->e_flags |= G_MIRROR_EVENT_DONE;
 				G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 		/*
 		 * Check if we can mark the array as CLEAN and, if we can't,
 		 * how many seconds we should wait.
 		 */
 		timeout = g_mirror_idle(sc, -1);
 		/*
 		 * Now I/O requests.
 		 */
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = bioq_takefirst(&sc->sc_queue);
 		if (bp == NULL) {
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				/*
 				 * The queue mutex is dropped around the destroy
 				 * attempt; afterwards re-check the queue for
 				 * requests that arrived in the meantime.
 				 */
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 				if (bioq_first(&sc->sc_queue) != NULL) {
 					mtx_unlock(&sc->sc_queue_mtx);
 					continue;
 				}
 			}
 			sx_xunlock(&sc->sc_lock);
 			/*
 			 * XXX: We can miss an event here, because an event
 			 *      can be added without sx-device-lock and without
 			 *      mtx-queue-lock. Maybe I should just stop using
 			 *      dedicated mutex for events synchronization and
 			 *      stick with the queue lock?
 			 *      The event will hang here until next I/O request
 			 *      or next event is received.
 			 */
 			/* PDROP: the queue mutex is not reacquired on wakeup. */
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		/*
 		 * Route the request: a bio issued through the synchronization
 		 * geom, a bio aimed at a component (bio_to is not our
 		 * provider), or a new request for the mirror provider itself.
 		 */
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
 			g_mirror_sync_request(bp);	/* READ */
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
 				g_mirror_regular_request(bp);
 			else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				g_mirror_sync_request(bp);	/* WRITE */
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else {
 			g_mirror_register_request(bp);
 		}
 		G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 /*
  * Bring a component's DIRTY flag in line with the device's idle state:
  * an idle device clears the flag, a non-idle device sets it.  Nothing is
  * done when the device has G_MIRROR_DEVICE_FLAG_NOFAILSYNC set.
  * sc->sc_lock must be held.
  */
 static void
 g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 	int is_dirty;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 
 	is_dirty = (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0;
 	if (sc->sc_idle) {
 		if (is_dirty) {
 			G_MIRROR_DEBUG(2,
 			    "Disk %s (device %s) marked as clean.",
 			    g_mirror_get_diskname(disk), sc->sc_name);
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		}
 	} else if (!is_dirty) {
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	}
 }
 
 /*
  * Start synchronization (rebuild) of a component.
  *
  * Creates a consumer on the synchronization geom, attaches it to the mirror's
  * own provider and opens it for reading, then allocates g_mirror_syncreqs
  * read bios of up to MAXPHYS bytes each, starting at disk->d_sync.ds_offset,
  * and issues them (or delays those colliding with regular requests).
  *
  * Called with sc->sc_lock held and without the topology lock; sc_lock is
  * dropped temporarily while the topology lock is taken.
  */
 static void
 g_mirror_sync_start(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error, i;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Disk %s is not marked for synchronization.",
 	    g_mirror_get_diskname(disk)));
 	KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Device not in RUNNING state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 
 	/* Drop sc_lock while the topology lock is held. */
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	/* Read-only access: synchronization reads go through the mirror. */
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_mirror_get_diskname(disk));
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	KASSERT(disk->d_sync.ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_mirror_get_diskname(disk)));
 
 	disk->d_sync.ds_consumer = cp;
 	disk->d_sync.ds_consumer->private = disk;
 	disk->d_sync.ds_consumer->index = 0;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
 	    M_MIRROR, M_WAITOK);
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = g_alloc_bio();
 		disk->d_sync.ds_bios[i] = bp;
 		bp->bio_parent = NULL;
 		bp->bio_cmd = BIO_READ;
 		bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK);
 		bp->bio_cflags = 0;
 		/* Each bio covers the next chunk, clipped at the media end. */
 		bp->bio_offset = disk->d_sync.ds_offset;
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		disk->d_sync.ds_offset += bp->bio_length;
 		bp->bio_done = g_mirror_sync_done;
 		bp->bio_from = disk->d_sync.ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		/* Slot index of this bio within ds_bios[]. */
 		bp->bio_caller1 = (void *)(uintptr_t)i;
 	}
 
 	/* Increase the number of disks in SYNCHRONIZING state. */
 	sc->sc_sync.ds_ndisks++;
 	/* Set the number of in-flight synchronization requests. */
 	disk->d_sync.ds_inflight = g_mirror_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = disk->d_sync.ds_bios[i];
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		disk->d_sync.ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, disk->d_sync.ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  *
  * 'type' only selects the log message.  Does nothing when the disk has no
  * synchronization consumer.  Otherwise delayed regular requests are
  * released, the ds_bios array is freed, the DIRTY flag is cleared and the
  * synchronization consumer is killed.
  *
  * Called with sc->sc_lock held and without the topology lock; sc_lock is
  * dropped temporarily while the consumer is killed.
  */
 static void
 g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 	    g_mirror_disk_state2str(disk->d_state)));
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	}
 	g_mirror_regular_release(sc);
 	/*
 	 * Only the pointer array is freed here.
 	 * NOTE(review): the bios themselves are presumably destroyed on
 	 * their completion path - confirm.
 	 */
 	free(disk->d_sync.ds_bios, M_MIRROR);
 	disk->d_sync.ds_bios = NULL;
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 	sc->sc_sync.ds_ndisks--;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_mirror_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 /*
  * Create and announce the "mirror/<name>" provider for the device.
  *
  * The provider takes its media and sector size from the softc, and the
  * largest stripe size (with the matching stripe offset) found among the
  * components.  Unmapped I/O is advertised unless the balance algorithm is
  * SPLIT or some component's provider lacks G_PF_ACCEPT_UNMAPPED.  After the
  * provider is live, synchronization is started for every SYNCHRONIZING
  * component.  sc->sc_lock must be held.
  */
 static void
 g_mirror_launch_provider(struct g_mirror_softc *sc)
 {
 	struct g_provider *newpp, *under;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	newpp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
 	newpp->flags |= G_PF_DIRECT_RECEIVE;
 	newpp->mediasize = sc->sc_mediasize;
 	newpp->sectorsize = sc->sc_sectorsize;
 	newpp->stripesize = 0;
 	newpp->stripeoffset = 0;
 
 	/* Splitting of unmapped BIO's could work but isn't implemented now */
 	if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
 		newpp->flags |= G_PF_ACCEPT_UNMAPPED;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		cp = disk->d_consumer;
 		if (cp == NULL || cp->provider == NULL)
 			continue;
 		under = cp->provider;
 		/* Inherit the largest stripe geometry seen so far. */
 		if (under->stripesize > newpp->stripesize) {
 			newpp->stripesize = under->stripesize;
 			newpp->stripeoffset = under->stripeoffset;
 		}
 		/* A provider underneath us doesn't support unmapped */
 		if ((under->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 			G_MIRROR_DEBUG(0,
 			    "Cancelling unmapped because of %s.", under->name);
 			newpp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 		}
 	}
 
 	newpp->private = sc;
 	sc->sc_refcnt++;
 	sc->sc_provider = newpp;
 	g_error_provider(newpp, 0);
 	g_topology_unlock();
 
 	G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", newpp->name,
 	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		g_mirror_sync_start(disk);
 	}
 }
 
 /*
  * Tear down the mirror provider.
  *
  * Synchronization is stopped on every SYNCHRONIZING component, the provider
  * is put into the ENXIO error state, every bio still sitting on sc->sc_queue
  * is either failed with ENXIO or destroyed, and the provider is withered.
  * Must be called without the topology lock held; the provider must exist.
  */
 static void
 g_mirror_destroy_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_stop(disk, 1);
 	}
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
 		/*
 		 * Abort any pending I/O that wasn't generated by us.
 		 * Synchronization requests and requests destined for individual
 		 * mirror components can be destroyed immediately.
 		 */
 		if (bp->bio_to == sc->sc_provider &&
 		    bp->bio_from->geom != sc->sc_sync.ds_geom) {
 			g_io_deliver(bp, ENXIO);
 		} else {
 			/* Synchronization bios own their data buffer. */
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				free(bp->bio_data, M_MIRROR);
 			g_destroy_bio(bp);
 		}
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	g_wither_provider(sc->sc_provider, ENXIO);
 	sc->sc_provider = NULL;
 	G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name);
 	g_topology_unlock();
 }
 
 /*
  * Force the device to start: log the timeout and queue a non-blocking
  * device event for the softc passed in 'arg'.
  * NOTE(review): presumably the handler armed on sc->sc_callout - confirm.
  */
 static void
 g_mirror_go(void *arg)
 {
 	struct g_mirror_softc *sc = arg;
 	const int evflags = G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE;
 
 	G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	g_mirror_event_send(sc, 0, evflags);
 }
 
 /*
  * Decide which state a component should enter, by comparing its syncid
  * with the device's syncid:
  *  - disk syncid newer than the device's: the disk is destroyed and
  *    G_MIRROR_DISK_STATE_NONE is returned;
  *  - disk syncid older: its synchronization data is reset and it needs a
  *    rebuild;
  *  - equal: the disk is ACTIVE unless it is flagged SYNCHRONIZING, or it is
  *    DIRTY while at least one other disk is ACTIVE.
  * A disk needing a rebuild becomes SYNCHRONIZING, or STALE when the device
  * is NOAUTOSYNC and the disk is not flagged FORCE_SYNC.
  */
 static u_int
 g_mirror_determine_state(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	u_int newstate;
 	int needs_sync;
 
 	sc = disk->d_softc;
 
 	if (disk->d_sync.ds_syncid > sc->sc_syncid) {
 		/*
 		 * Not good, NOT GOOD!
 		 * The mirror was started on stale disks and a fresher disk
 		 * has just arrived.  If there were writes, the mirror is
 		 * broken.  The safest choice is to leave this disk alone and
 		 * tell the user loudly.
 		 */
 		G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_mirror_get_diskname(disk));
 		g_mirror_destroy_disk(disk);
 		/* The disk is gone; do not touch it any more. */
 		return (G_MIRROR_DISK_STATE_NONE);
 	}
 
 	if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Throw away all synchronization progress: whatever this
 		 * disk was synchronized with had a different syncid.
 		 */
 		disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 		needs_sync = 1;
 	} else if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 		/* An interrupted rebuild continues from the stored offset. */
 		needs_sync = 1;
 	} else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0) {
 		/* Nothing to synchronize from. */
 		needs_sync = 0;
 	} else {
 		needs_sync = (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0;
 	}
 
 	if (!needs_sync) {
 		/* Disk does not need synchronization. */
 		newstate = G_MIRROR_DISK_STATE_ACTIVE;
 	} else if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 	    (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
 		newstate = G_MIRROR_DISK_STATE_SYNCHRONIZING;
 	} else {
 		newstate = G_MIRROR_DISK_STATE_STALE;
 	}
 
 	G_MIRROR_DEBUG(3, "State for %s disk: %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(newstate));
 	return (newstate);
 }
 
 /*
  * Update device state.
  *
  * STARTING: proceeds once every configured component is connected, or at
  * least one is connected and 'force' is true.  Components with an outdated
  * genid are destroyed, the biggest syncid is chosen, dirty components that
  * must be rebuilt get their syncid zeroed, the device switches to RUNNING and
  * a state event is queued for every component.  If no component is usable
  * the device is flagged for destruction instead.
  *
  * RUNNING: flags the device for destruction when it has neither ACTIVE nor
  * NEW components; launches the provider (if missing) and releases the root
  * mount hold when it has ACTIVE and no NEW components; then performs any
  * pending genid bump and immediate syncid bump.
  *
  * sc->sc_lock must be held exclusively.
  */
 static void
 g_mirror_update_device(struct g_mirror_softc *sc, bool force)
 {
 	struct g_mirror_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_MIRROR_DEVICE_STATE_STARTING:
 	    {
 		struct g_mirror_disk *pdisk, *tdisk;
 		u_int dirty, ndisks, genid, syncid;
 		bool broken;
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? We are, if all disks are connected or
 		 * if we have any disks and 'force' is true.
 		 */
 		ndisks = g_mirror_ndisks(sc, -1);
 		if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) {
 			;
 		} else if (ndisks == 0) {
 			/*
 			 * Disks went down in starting phase, so destroy
 			 * device.
 			 */
 			callout_drain(&sc->sc_callout);
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 			return;
 		} else {
 			/* Still waiting for more components. */
 			return;
 		}
 
 		/*
 		 * Activate all disks with the biggest syncid.
 		 */
 		if (force) {
 			/*
 			 * If 'force' is true, we have been called due to
 			 * timeout, so don't bother canceling timeout.
 			 */
 			/* Count the disks that are not in mid-rebuild. */
 			ndisks = 0;
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
 					ndisks++;
 				}
 			}
 			if (ndisks == 0) {
 				/* No valid disks found, destroy device. */
 				sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 				return;
 			}
 		} else {
 			/* Cancel timeout. */
 			callout_drain(&sc->sc_callout);
 		}
 
 		/*
 		 * Find the biggest genid.
 		 */
 		genid = 0;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_genid > genid)
 				genid = disk->d_genid;
 		}
 		sc->sc_genid = genid;
 		/*
 		 * Remove all disks without the biggest genid.
 		 */
 		broken = false;
 		LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
 			if (disk->d_genid < genid) {
 				G_MIRROR_DEBUG(0,
 				    "Component %s (device %s) broken, skipping.",
 				    g_mirror_get_diskname(disk), sc->sc_name);
 				g_mirror_destroy_disk(disk);
 				/*
 				 * Bump the syncid in case we discover a healthy
 				 * replacement disk after starting the mirror.
 				 */
 				broken = true;
 			}
 		}
 
 		/*
 		 * Find the biggest syncid.
 		 */
 		syncid = 0;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid > syncid)
 				syncid = disk->d_sync.ds_syncid;
 		}
 
 		/*
 		 * Here we need to look for dirty disks and if all disks
 		 * with the biggest syncid are dirty, we have to choose
 		 * one with the biggest priority and rebuild the rest.
 		 */
 		/*
 		 * Find the number of dirty disks with the biggest syncid.
 		 * Find the number of disks with the biggest syncid.
 		 * While here, find a disk with the biggest priority.
 		 */
 		dirty = ndisks = 0;
 		pdisk = NULL;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid != syncid)
 				continue;
 			if ((disk->d_flags &
 			    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
 				dirty++;
 				if (pdisk == NULL ||
 				    pdisk->d_priority < disk->d_priority) {
 					pdisk = disk;
 				}
 			}
 		}
 		if (dirty == 0) {
 			/* No dirty disks at all, great. */
 		} else if (dirty == ndisks) {
 			/*
 			 * Force synchronization for all dirty disks except one
 			 * with the biggest priority.
 			 */
 			KASSERT(pdisk != NULL, ("pdisk == NULL"));
 			G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
 			    "master disk for synchronization.",
 			    g_mirror_get_diskname(pdisk), sc->sc_name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				KASSERT((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) != 0,
 				    ("Disk %s isn't marked as dirty.",
 				    g_mirror_get_diskname(disk)));
 				/* Skip the disk with the biggest priority. */
 				if (disk == pdisk)
 					continue;
 				/* A zero syncid makes the disk look outdated. */
 				disk->d_sync.ds_syncid = 0;
 			}
 		} else if (dirty < ndisks) {
 			/*
 			 * Force synchronization for all dirty disks.
 			 * We have some non-dirty disks.
 			 */
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_sync.ds_syncid = 0;
 			}
 		}
 
 		/* Reset hint. */
 		sc->sc_hint = NULL;
 		sc->sc_syncid = syncid;
 		if (force || broken) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		state = G_MIRROR_DEVICE_STATE_RUNNING;
 		G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_device_state2str(state));
 		sc->sc_state = state;
 		/* Queue a state-change event for every remaining component. */
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			state = g_mirror_determine_state(disk);
 			g_mirror_event_send(disk, state,
 			    G_MIRROR_EVENT_DONTWAIT);
 			if (state == G_MIRROR_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_MIRROR_DEVICE_STATE_RUNNING:
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * No usable disks, so destroy the device.
 			 */
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			break;
 		} else if (g_mirror_ndisks(sc,
 		    G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * We have active disks, launch provider if it doesn't
 			 * exist.
 			 */
 			if (sc->sc_provider == NULL)
 				g_mirror_launch_provider(sc);
 			if (sc->sc_rootmount != NULL) {
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 		}
 		/*
 		 * Genid should be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
 			g_mirror_bump_genid(sc);
 		}
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW;
 			g_mirror_bump_syncid(sc);
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  *
  * Moves 'disk' into 'state', performing the side effects that belong to the
  * transition (list insertion, metadata update, synchronization start/stop,
  * disk destruction).  A NEW disk on a RUNNING device is immediately moved on
  * to the state chosen by g_mirror_determine_state().
  *
  * sc->sc_lock must be held exclusively.  Every visible path returns 0,
  * including a DESTROY request whose metadata could not be cleared.
  *
  * DISK_STATE_CHANGED() logs the transition; it relies on the local names
  * 'disk', 'state' and 'sc' and must run before disk->d_state is updated.
  */
 #define	DISK_STATE_CHANGED()	G_MIRROR_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_mirror_get_diskname(disk),					\
 	g_mirror_disk_state2str(disk->d_state),				\
 	g_mirror_disk_state2str(state), sc->sc_name)
 static int
 g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
 {
 	struct g_mirror_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 again:
 	G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
 	    g_mirror_disk_state2str(state));
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk arrives.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		/* Keep sc_disks ordered by descending priority. */
 		if (LIST_EMPTY(&sc->sc_disks))
 			LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
 		else {
 			struct g_mirror_disk *dp;
 
 			LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 				if (disk->d_priority >= dp->d_priority) {
 					LIST_INSERT_BEFORE(dp, disk, d_next);
 					dp = NULL;
 					break;
 				}
 				/* Stop on the last element; append below. */
 				if (LIST_NEXT(dp, d_next) == NULL)
 					break;
 			}
 			if (dp != NULL)
 				LIST_INSERT_AFTER(dp, disk, d_next);
 		}
 		G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/* NONE means the disk was destroyed; otherwise move it on. */
 		state = g_mirror_determine_state(disk);
 		if (state != G_MIRROR_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
 			g_mirror_sync_stop(disk, 0);
 		}
 		disk->d_state = state;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_mirror_update_idle(sc, disk);
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		/*
 		 * Without a provider the rebuild cannot start yet;
 		 * g_mirror_launch_provider() starts it for SYNCHRONIZING disks.
 		 */
 		if (sc->sc_provider != NULL) {
 			g_mirror_sync_start(disk);
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but the disk disappeared.
 		 * 2. Disk was active and disappeared.
 		 * 3. Disk disappeared during the synchronization process.
 		 */
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_STALE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_mirror_device_state2str(sc->sc_state),
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		break;
 	case G_MIRROR_DISK_STATE_DESTROY:
 	    {
 		int error;
 
 		/* Keep the disk if its on-disk metadata cannot be cleared. */
 		error = g_mirror_clear_metadata(disk);
 		if (error != 0) {
 			G_MIRROR_DEBUG(0,
 			    "Device %s: failed to clear metadata on %s: %d.",
 			    sc->sc_name, g_mirror_get_diskname(disk), error);
 			break;
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		sc->sc_ndisks--;
 		/* 'disk' is reused as the iterator; rewrite the metadata of the remaining disks. */
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	    }
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 /*
  * Read the metadata sector from the provider attached to cp and decode it
  * into md.
  *
  * The topology lock must be held on entry (g_topology_assert()); it is
  * dropped for the duration of the read and held again on return.  A read
  * reference is taken on the consumer for the read and released afterwards.
  *
  * Returns 0 on success; the g_access() or g_read_data() error when the
  * sector cannot be read; EINVAL when the magic string is not
  * G_MIRROR_MAGIC or the stored version is newer than G_MIRROR_VERSION;
  * otherwise the error reported by mirror_metadata_decode().
  */
 int
 g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	/* Take a read reference for the duration of the read. */
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata are stored on last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	/* Drop the read reference taken above; the result is not checked. */
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = mirror_metadata_decode(buf, md);
 	g_free(buf);
 	/*
 	 * The decoder's result is examined last: the magic and version are
 	 * checked first, so foreign or too-new data yields EINVAL whatever
 	 * the decoder returned.
 	 */
 	if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (error != 0) {
 		G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Check that the metadata md read from provider pp is compatible with
  * device sc.
  *
  * The checks run in a fixed order and the first one that fails decides the
  * result: EEXIST when a disk with the same ID already belongs to the
  * device, EINVAL for any other mismatch.  Returns 0 when every check passes.
  */
 static int
 g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 	const char *badfield;
 
 	if (g_mirror_id2disk(sc, md->md_did) != NULL) {
 		G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
 		    pp->name, md->md_did);
 		return (EEXIST);
 	}
 
 	/* Fields that have to agree with the device configuration. */
 	badfield = NULL;
 	if (md->md_all != sc->sc_ndisks)
 		badfield = "md_all";
 	else if (md->md_slice != sc->sc_slice)
 		badfield = "md_slice";
 	else if (md->md_balance != sc->sc_balance)
 		badfield = "md_balance";
 	if (badfield != NULL) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    badfield, pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 #if 0
 	if (md->md_mediasize != sc->sc_mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 #endif
 
 	/* The provider must not be smaller than the device. */
 	if (pp->mediasize < sc->sc_mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 
 	if (sc->sc_sectorsize != md->md_sectorsize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	/* The device sector size must be a multiple of the provider's. */
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	/* Reject flag bits that fall outside the known masks. */
 	if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Add provider pp to device sc, using the metadata md read from it.
  *
  * The metadata is validated first, and a component whose generation ID is
  * lower than that of an already running device is refused.  The new disk
  * is then announced with a G_MIRROR_DISK_STATE_NEW event, sent with
  * G_MIRROR_EVENT_WAIT.  When the on-disk metadata version is older than
  * G_MIRROR_VERSION the metadata is rewritten.
  *
  * Must be called without the topology lock.  Returns 0 on success or an
  * errno value.
  */
 int
 g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_disk *newdisk;
 	int rv;
 
 	g_topology_assert_not();
 	G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);
 
 	if ((rv = g_mirror_check_metadata(sc, pp, md)) != 0)
 		return (rv);
 
 	/* A running device refuses components with an older generation. */
 	if (md->md_genid < sc->sc_genid &&
 	    sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
 		G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	newdisk = g_mirror_init_disk(sc, pp, md, &rv);
 	if (newdisk == NULL)
 		return (rv);
 
 	rv = g_mirror_event_send(newdisk, G_MIRROR_DISK_STATE_NEW,
 	    G_MIRROR_EVENT_WAIT);
 	if (rv == 0 && md->md_version < G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 		    pp->name, md->md_version, G_MIRROR_VERSION);
 		g_mirror_update_metadata(newdisk);
 	}
 	return (rv);
 }
 
 /*
  * Event callback that performs a deferred destroy of the device passed in
  * arg (a struct g_mirror_softc *).  g_mirror_access() posts it with
  * g_post_event().
  *
  * When flag is EV_CANCEL the event was cancelled and nothing is done.
  * Otherwise the topology lock is dropped, sc_lock is taken and a soft
  * destroy is attempted; the topology lock is held again on return.
  */
 static void
 g_mirror_destroy_delayed(void *arg, int flag)
 {
 	struct g_mirror_softc *sc;
 	int error;
 
 	if (flag == EV_CANCEL) {
 		G_MIRROR_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 	sc = arg;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0,
 	    ("CLOSEWAIT flag not set on %s.", sc->sc_name));
 	G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
 	error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
 		    sc->sc_name, error);
 		/*
 		 * sc_lock is released here only on failure; on success
 		 * g_mirror_destroy() has presumably already disposed of
 		 * it -- confirm against g_mirror_destroy_device().
 		 */
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 }
 
 /*
  * GEOM access method of the mirror provider pp.
  *
  * acr, acw and ace are the requested changes to the read, write and
  * exclusive counts.  When the device is flagged DESTROY or CLOSEWAIT, or
  * has no disks, a request that increases any count fails with ENXIO and
  * any other request returns 0; in both cases sc_provider_open is left
  * unchanged.  Otherwise the three deltas are added to sc_provider_open.
  *
  * Called with the topology lock held; it is dropped while sc_lock is held
  * and taken again before returning.
  */
 static int
 g_mirror_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_mirror_softc *sc;
 	int error = 0;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	sc = pp->private;
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 ||
 	    LIST_EMPTY(&sc->sc_disks)) {
 		if (acr > 0 || acw > 0 || ace > 0)
 			error = ENXIO;
 		goto end;
 	}
 	sc->sc_provider_open += acr + acw + ace;
 	/* pp->acw is presumably the current write count: no writers remain. */
 	if (pp->acw + acw == 0)
 		g_mirror_idle(sc, 0);
 	/*
 	 * NOTE(review): the early exit above already covers CLOSEWAIT, so
 	 * this condition looks unreachable as written -- confirm where the
 	 * delayed destroy is meant to be posted on last close.
 	 */
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 &&
 	    sc->sc_provider_open == 0)
 		g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL);
 end:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 /*
  * Create a new mirror device of class mp from the metadata md.
  *
  * Two geoms are created: the action geom named after md_name, which gets
  * the start/orphan/access/dumpconf methods, and a synchronization geom
  * named "<md_name>.sync".  Both point at the same freshly allocated softc,
  * which starts in G_MIRROR_DEVICE_STATE_STARTING without a provider.  A
  * worker kernel process is started, a root mount hold is taken and a
  * callout is armed to run g_mirror_go() after g_mirror_timeout seconds.
  *
  * type is stored in sc_type (G_MIRROR_TYPE_MANUAL or _AUTOMATIC).
  *
  * Must be called with the topology lock held.  Returns the action geom, or
  * NULL when md_all is less than 1 or the worker cannot be created.
  */
 struct g_geom *
 g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md,
     u_int type)
 {
 	struct g_mirror_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_mid);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom.
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
 	gp->start = g_mirror_start;
 	gp->orphan = g_mirror_orphan;
 	gp->access = g_mirror_access;
 	gp->dumpconf = g_mirror_dumpconf;
 
 	/* Device configuration taken from the metadata. */
 	sc->sc_type = type;
 	sc->sc_id = md->md_mid;
 	sc->sc_slice = md->md_slice;
 	sc->sc_balance = md->md_balance;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_flags = md->md_mflags;
 	/* Runtime state. */
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	sc->sc_refcnt = 1;
 	sx_init(&sc->sc_lock, "gmirror:lock");
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
 	bioq_init(&sc->sc_regular_delayed);
 	bioq_init(&sc->sc_inflight);
 	bioq_init(&sc->sc_sync_delayed);
 	LIST_INIT(&sc->sc_disks);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
 	sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	/* No provider exists yet; readers of sc_provider must expect NULL. */
 	sc->sc_provider = NULL;
 	sc->sc_provider_open = 0;
 	/*
 	 * Synchronization geom.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_mirror_orphan;
 	sc->sc_sync.ds_geom = gp;
 	sc->sc_sync.ds_ndisks = 0;
 	error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_mirror %s", md->md_name);
 	if (error != 0) {
 		G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		/* Undo both geoms and the softc before giving up. */
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		g_destroy_geom(sc->sc_geom);
 		g_mirror_free_device(sc);
 		return (NULL);
 	}
 
 	G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	sc->sc_rootmount = root_mount_hold("GMIRROR");
 	G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 	/*
 	 * Run timeout.
 	 */
 	timeout = g_mirror_timeout * hz;
 	callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
 	return (sc->sc_geom);
 }
 
 /*
  * Destroy the device sc.
  *
  * how selects what happens when the provider is still open
  * (sc_provider_open != 0):
  *   G_MIRROR_DESTROY_SOFT     - refuse with EBUSY;
  *   G_MIRROR_DESTROY_DELAYED  - stop synchronization on every
  *                               SYNCHRONIZING disk, set CLOSEWAIT and
  *                               return EBUSY;
  *   G_MIRROR_DESTROY_HARD     - log a message and destroy anyway.
  *
  * Must be called without the topology lock and with sc_lock exclusively
  * held.  On the EBUSY returns sc_lock is still held by the caller.  When
  * the DESTROY flag is already set the lock is released and 0 is returned.
  * Otherwise the worker is woken and waited for, and the device is handed
  * to g_mirror_destroy_device(); callers in this file do not unlock after
  * a 0 return, so that call presumably disposes of the lock -- confirm.
  */
 int
 g_mirror_destroy(struct g_mirror_softc *sc, int how)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider_open != 0) {
 		switch (how) {
 		case G_MIRROR_DESTROY_SOFT:
 			G_MIRROR_DEBUG(1,
 			    "Device %s is still open (%d).", sc->sc_name,
 			    sc->sc_provider_open);
 			return (EBUSY);
 		case G_MIRROR_DESTROY_DELAYED:
 			G_MIRROR_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    sc->sc_name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_state ==
 				    G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 					g_mirror_sync_stop(disk, 1);
 				}
 			}
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT;
 			return (EBUSY);
 		case G_MIRROR_DESTROY_HARD:
 			/* Only logs; execution continues with the destroy. */
 			G_MIRROR_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", sc->sc_name);
 		}
 	}
 
 	/* A destroy is already in progress: nothing more to do here. */
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 		sx_xunlock(&sc->sc_lock);
 		return (0);
 	}
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN;
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	/* sc_lock is dropped while waiting for the worker to exit. */
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	/* Poll every hz / 5 ticks until sc_worker has been cleared. */
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_mirror_destroy_device(sc);
 	return (0);
 }
 
 /*
  * Orphan method installed on the temporary geom used by g_mirror_taste().
  * It is not expected to run: the body is an assertion that always fails.
  */
 static void
 g_mirror_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 /*
  * GEOM taste method: check whether provider pp carries gmirror metadata
  * and, if so, attach it to the matching device, creating the device when
  * it does not exist yet.
  *
  * The provider is skipped (NULL is returned) when the metadata cannot be
  * read, names a different hardcoded provider, records a different provider
  * size, is flagged INACTIVE, or when an automatic device of the same name
  * but a different ID is already configured.
  *
  * Called with the topology lock held; the lock is dropped while the disk
  * is added and held again on return.  Returns the device geom the disk was
  * added to, or NULL.
  */
 static struct g_geom *
 g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_mirror_metadata md;
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_MIRROR_DEBUG(2, "Tasting %s.", pp->name);
 
 	/* Temporary geom and consumer, used only to read the metadata. */
 	gp = g_new_geomf(mp, "mirror:taste");
 	/*
 	 * This orphan function should be never called.
 	 */
 	gp->orphan = g_mirror_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_mirror_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	/* A hardcoded provider name, when present, has to match. */
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	/* A recorded provider size, when present, has to match. */
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Device %s: provider %s marked as inactive, skipping.",
 		    md.md_name, pp->name);
 		return (NULL);
 	}
 	if (g_mirror_debug >= 2)
 		mirror_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
 			continue;
 		/* Skip synchronization geoms. */
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		/* Same name but a different ID: refuse the provider. */
 		if (md.md_mid != sc->sc_id) {
 			G_MIRROR_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	/* gp is NULL here when the loop found no matching device. */
 	if (gp == NULL) {
 		gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC);
 		if (gp == NULL) {
 			G_MIRROR_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	/* TASTING is set for the duration of the add and cleared below. */
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING;
 	error = g_mirror_add_disk(sc, pp, &md);
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		/* A device left without any disk is destroyed again. */
 		if (LIST_EMPTY(&sc->sc_disks)) {
 			g_cancel_event(sc);
 			g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		gp = NULL;
 	}
 	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING;
 	/* The DESTROY flag is set at this point: finish the destroy. */
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 		g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 		g_topology_lock();
 		return (NULL);
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 /*
  * Resize method for a consumer cp: rewrite the metadata of the disk
  * attached to it.  A consumer without a disk is ignored.
  *
  * Called with the topology lock held; the lock is dropped around the
  * metadata update and held again on return.
  */
 static void
 g_mirror_resize(struct g_consumer *cp)
 {
 	struct g_mirror_disk *component;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);
 
 	component = cp->private;
 	if (component != NULL) {
 		g_topology_unlock();
 		g_mirror_update_metadata(component);
 		g_topology_lock();
 	}
 }
 
 /*
  * Class destroy_geom method: cancel pending events for the device behind
  * gp and attempt a soft destroy.
  *
  * The topology lock is dropped on entry and held again on return.
  * Returns the result of g_mirror_destroy().
  */
 static int
 g_mirror_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_mirror_softc *sc;
 	int rv;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	rv = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
 	if (rv != 0) {
 		/* The device survived, so its lock is still ours to drop. */
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 	return (rv);
 }
 
 static void
 g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_mirror_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)disk->d_id);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_printf(sb, "0%%");
 			else {
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    sc->sc_provider->mediasize));
 			}
 			sbuf_printf(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0) {
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 			}
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent,
 		    disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Priority>%u</Priority>\n", indent,
 		    disk->d_priority);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_mirror_disk_state2str(disk->d_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<Type>", indent);
 		switch (sc->sc_type) {
 		case G_MIRROR_TYPE_AUTOMATIC:
 			sbuf_printf(sb, "AUTOMATIC");
 			break;
 		case G_MIRROR_TYPE_MANUAL:
 			sbuf_printf(sb, "MANUAL");
 			break;
 		default:
 			sbuf_printf(sb, "UNKNOWN");
 			break;
 		}
 		sbuf_printf(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Slice>%u</Slice>\n", indent,
 		    (u_int)sc->sc_slice);
 		sbuf_printf(sb, "%s<Balance>%s</Balance>\n", indent,
 		    balance_name(sc->sc_balance));
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			sbuf_printf(sb, "%s", "STARTING");
 		else if (sc->sc_ndisks ==
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE))
 			sbuf_printf(sb, "%s", "COMPLETE");
 		else
 			sbuf_printf(sb, "%s", "DEGRADED");
 		sbuf_printf(sb, "</State>\n");
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 /*
  * shutdown_post_sync event handler, registered by g_mirror_init() with the
  * class as its argument.
  *
  * Does nothing when panicstr is set.  Otherwise it sets g_mirror_shutdown
  * and, for every device geom of the class, calls g_mirror_idle(sc, -1),
  * cancels pending events and requests a delayed destroy.  howto is not
  * used.
  */
 static void
 g_mirror_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_mirror_softc *sc;
 	int error;
 
 	if (panicstr != NULL)
 		return;
 
 	mp = arg;
 	g_topology_lock();
 	g_mirror_shutdown = 1;
 	/* The _SAFE variant is used because the loop may destroy geoms. */
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		/* Skip synchronization geom. */
 		if (gp == sc->sc_sync.ds_geom)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_mirror_idle(sc, -1);
 		g_cancel_event(sc);
 		error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED);
 		/* On failure the device still exists and so does its lock. */
 		if (error != 0)
 			sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 }
 
 /*
  * Class init method: register g_mirror_shutdown_post_sync() as a
  * shutdown_post_sync handler for class mp and remember the registration
  * in g_mirror_post_sync.  A failed registration is only logged.
  */
 static void
 g_mirror_init(struct g_class *mp)
 {
 
 	g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_mirror_post_sync != NULL)
 		return;
 	G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 /*
  * Class fini method: remove the shutdown_post_sync handler registered by
  * g_mirror_init(), if that registration succeeded.
  */
 static void
 g_mirror_fini(struct g_class *mp)
 {
 
 	if (g_mirror_post_sync == NULL)
 		return;
 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync);
 }
 
 /*
  * Declare g_mirror_class (defined outside this excerpt) as a GEOM class
  * under the name g_mirror -- presumably this is what makes the class
  * loadable as a module; see DECLARE_GEOM_CLASS in <geom/geom.h>.
  */
 DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
Index: projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.h
===================================================================
--- projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.h	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/geom/mirror/g_mirror.h	(revision 326162)
@@ -1,510 +1,510 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_G_MIRROR_H_
 #define	_G_MIRROR_H_
 
 #include <sys/endian.h>
 #include <sys/md5.h>
 
 #define	G_MIRROR_CLASS_NAME	"MIRROR"
 
 /* Magic string stored in md_magic at the start of the encoded metadata. */
 #define	G_MIRROR_MAGIC		"GEOM::MIRROR"
 /*
  * Version history:
  * 0 - Initial version number.
  * 1 - Added 'prefer' balance algorithm.
  * 2 - Added md_genid field to metadata.
  * 3 - Added md_provsize field to metadata.
  * 4 - Added 'no failure synchronization' flag.
  */
 #define	G_MIRROR_VERSION	4
 
 /* Balance algorithm numbers; _MIN and _MAX bound the defined range. */
 #define	G_MIRROR_BALANCE_NONE		0
 #define	G_MIRROR_BALANCE_ROUND_ROBIN	1
 #define	G_MIRROR_BALANCE_LOAD		2
 #define	G_MIRROR_BALANCE_SPLIT		3
 #define	G_MIRROR_BALANCE_PREFER		4
 #define	G_MIRROR_BALANCE_MIN		G_MIRROR_BALANCE_NONE
 #define	G_MIRROR_BALANCE_MAX		G_MIRROR_BALANCE_PREFER
 
 /* Per-disk flag bits, used in d_flags and in the on-disk md_dflags. */
 #define	G_MIRROR_DISK_FLAG_DIRTY		0x0000000000000001ULL
 #define	G_MIRROR_DISK_FLAG_SYNCHRONIZING	0x0000000000000002ULL
 #define	G_MIRROR_DISK_FLAG_FORCE_SYNC		0x0000000000000004ULL
 #define	G_MIRROR_DISK_FLAG_INACTIVE		0x0000000000000008ULL
 #define	G_MIRROR_DISK_FLAG_HARDCODED		0x0000000000000010ULL
 #define	G_MIRROR_DISK_FLAG_BROKEN		0x0000000000000020ULL
 #define	G_MIRROR_DISK_FLAG_CANDELETE		0x0000000000000040ULL
 /*
  * Disk flag bits accepted in metadata: g_mirror_check_metadata() rejects
  * md_dflags with any bit outside this mask.  Note that HARDCODED and BROKEN
  * are defined above but are not part of the mask.
  */
 #define	G_MIRROR_DISK_FLAG_MASK		(G_MIRROR_DISK_FLAG_DIRTY |	\
 					 G_MIRROR_DISK_FLAG_SYNCHRONIZING | \
 					 G_MIRROR_DISK_FLAG_FORCE_SYNC | \
 					 G_MIRROR_DISK_FLAG_INACTIVE | \
 					 G_MIRROR_DISK_FLAG_CANDELETE)
 
 /* Per-device flag bits, used in sc_flags and in the on-disk md_mflags. */
 #define	G_MIRROR_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
 #define	G_MIRROR_DEVICE_FLAG_NOFAILSYNC	0x0000000000000002ULL
 /*
  * Device flag bits accepted in metadata: g_mirror_check_metadata() rejects
  * md_mflags with any bit outside this mask.
  */
 #define	G_MIRROR_DEVICE_FLAG_MASK	(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC | \
 					 G_MIRROR_DEVICE_FLAG_NOFAILSYNC)
 
 #ifdef _KERNEL
-extern u_int g_mirror_debug;
+extern int g_mirror_debug;
 
 /*
  * Print a "GEOM_MIRROR" debug line when g_mirror_debug is at least lvl.
  * The remaining arguments are a printf() format string and its arguments.
  * The level is included in the prefix only when g_mirror_debug is greater
  * than 0.  lvl is expanded more than once, so it must not have side
  * effects.
  */
 #define	G_MIRROR_DEBUG(lvl, ...)	do {				\
 	if (g_mirror_debug >= (lvl)) {					\
 		printf("GEOM_MIRROR");					\
 		if (g_mirror_debug > 0)					\
 			printf("[%u]", lvl);				\
 		printf(": ");						\
 		printf(__VA_ARGS__);					\
 		printf("\n");						\
 	}								\
 } while (0)
 /*
  * Like G_MIRROR_DEBUG, but the message is followed by a space and the
  * output of g_print_bio(bp) before the newline.  lvl is expanded more
  * than once, so it must not have side effects.
  */
 #define	G_MIRROR_LOGREQ(lvl, bp, ...)	do {				\
 	if (g_mirror_debug >= (lvl)) {					\
 		printf("GEOM_MIRROR");					\
 		if (g_mirror_debug > 0)					\
 			printf("[%u]", lvl);				\
 		printf(": ");						\
 		printf(__VA_ARGS__);					\
 		printf(" ");						\
 		g_print_bio(bp);					\
 		printf("\n");						\
 	}								\
 } while (0)
 
 /*
  * Flags distinguishing regular from synchronization bios -- where they
  * are stored is not visible in this header; confirm against g_mirror.c.
  */
 #define	G_MIRROR_BIO_FLAG_REGULAR	0x01
 #define	G_MIRROR_BIO_FLAG_SYNC		0x02
 
 /*
  * Per-disk synchronization state (embedded in struct g_mirror_disk as
  * d_sync).
  */
 struct g_mirror_disk_sync {
 	struct g_consumer *ds_consumer;	/* Consumer connected to our mirror. */
 	off_t		  ds_offset;	/* Offset of next request to send. */
 	off_t		  ds_offset_done; /* Offset of already synchronized
 					   region. */
 	u_int		  ds_syncid;	/* Disk's synchronization ID. */
 	u_int		  ds_inflight;	/* Number of in-flight sync requests. */
 	struct bio	**ds_bios;	/* BIOs for synchronization I/O. */
 };
 
 /*
  * Device-wide synchronization state (embedded in struct g_mirror_softc as
  * sc_sync).
  */
 struct g_mirror_device_sync {
 	struct g_geom	*ds_geom;	/* Synchronization geom. */
 	u_int		 ds_ndisks;	/* Number of disks in SYNCHRONIZING
 					   state. */
 };
 
 /* Disk states, kept in g_mirror_disk.d_state. */
 #define	G_MIRROR_DISK_STATE_NONE		0
 #define	G_MIRROR_DISK_STATE_NEW			1
 #define	G_MIRROR_DISK_STATE_ACTIVE		2
 #define	G_MIRROR_DISK_STATE_STALE		3
 #define	G_MIRROR_DISK_STATE_SYNCHRONIZING	4
 #define	G_MIRROR_DISK_STATE_DISCONNECTED	5
 #define	G_MIRROR_DISK_STATE_DESTROY		6
 /*
  * One component of a mirror.  Disks are linked into the owning device's
  * sc_disks list through d_next.
  */
 struct g_mirror_disk {
 	uint32_t	 d_id;		/* Disk ID. */
 	struct g_consumer *d_consumer;	/* Consumer. */
 	struct g_mirror_softc	*d_softc; /* Back-pointer to softc. */
 	int		 d_state;	/* Disk state. */
 	u_int		 d_priority;	/* Disk priority. */
 	u_int		 load;		/* Averaged queue length */
 	off_t		 d_last_offset;	/* Last read offset */
 	uint64_t	 d_flags;	/* Additional flags. */
 	u_int		 d_genid;	/* Disk's generation ID. */
 	struct g_mirror_disk_sync d_sync;/* Sync information. */
 	LIST_ENTRY(g_mirror_disk) d_next;
 };
 /* Shorthand for the name of the provider the disk's consumer points at. */
 #define	d_name	d_consumer->provider->name
 
 /*
  * Event flags, passed as the flags argument of g_mirror_event_send() and
  * presumably kept in e_flags -- confirm against g_mirror.c.
  */
 #define	G_MIRROR_EVENT_DONTWAIT	0x1
 #define	G_MIRROR_EVENT_WAIT	0x2
 #define	G_MIRROR_EVENT_DEVICE	0x4
 #define	G_MIRROR_EVENT_DONE	0x8
 /* A queued state-change event; events are linked into sc_events. */
 struct g_mirror_event {
 	struct g_mirror_disk	*e_disk;
 	int			 e_state;
 	int			 e_flags;
 	int			 e_error;
 	TAILQ_ENTRY(g_mirror_event) e_next;
 };
 
 /*
  * Runtime-only device flags kept in sc_flags.  They occupy high bits that
  * are not part of G_MIRROR_DEVICE_FLAG_MASK.
  */
 #define	G_MIRROR_DEVICE_FLAG_DESTROY	0x0100000000000000ULL
 #define	G_MIRROR_DEVICE_FLAG_DRAIN	0x0200000000000000ULL
 #define	G_MIRROR_DEVICE_FLAG_CLOSEWAIT	0x0400000000000000ULL
 #define	G_MIRROR_DEVICE_FLAG_TASTING	0x0800000000000000ULL
 #define	G_MIRROR_DEVICE_FLAG_WIPE	0x1000000000000000ULL
 
 /* Device states, kept in sc_state. */
 #define	G_MIRROR_DEVICE_STATE_STARTING		0
 #define	G_MIRROR_DEVICE_STATE_RUNNING		1
 
 /* Device types, kept in sc_type. */
 #define	G_MIRROR_TYPE_MANUAL	0
 #define	G_MIRROR_TYPE_AUTOMATIC	1
 
 /* Bits for sc_bump_id. */
 /* Bump syncid on first write. */
 #define	G_MIRROR_BUMP_SYNCID		0x1
 /* Bump genid immediately. */
 #define	G_MIRROR_BUMP_GENID		0x2
 /* Bump syncid immediately. */
 #define	G_MIRROR_BUMP_SYNCID_NOW	0x4
 /*
  * Per-device state of one mirror.  It is allocated and initialized by
  * g_mirror_create() and shared by the action geom and the synchronization
  * geom.
  */
 struct g_mirror_softc {
 	u_int		sc_type;	/* Device type (manual/automatic). */
 	u_int		sc_state;	/* Device state. */
 	uint32_t	sc_slice;	/* Slice size. */
 	uint8_t		sc_balance;	/* Balance algorithm. */
 	uint64_t	sc_mediasize;	/* Device size. */
 	uint32_t	sc_sectorsize;	/* Sector size. */
 	uint64_t	sc_flags;	/* Additional flags. */
 
 	struct g_geom	*sc_geom;
 	struct g_provider *sc_provider;
 	int		sc_provider_open;
 
 	uint32_t	sc_id;		/* Mirror unique ID. */
 
 	struct sx	 sc_lock;
 	struct bio_queue_head sc_queue;
 	struct mtx	 sc_queue_mtx;
 	struct proc	*sc_worker;
 	struct bio_queue_head sc_regular_delayed; /* Delayed I/O requests due
 						     collision with sync
 						     requests. */
 	struct bio_queue_head sc_inflight; /* In-flight regular write
 					      requests. */
 	struct bio_queue_head sc_sync_delayed; /* Delayed sync requests due
 						  collision with regular
 						  requests. */
 
 	LIST_HEAD(, g_mirror_disk) sc_disks;
 	u_int		sc_ndisks;	/* Number of disks. */
 	struct g_mirror_disk *sc_hint;
 
 	u_int		sc_genid;	/* Generation ID. */
 	u_int		sc_syncid;	/* Synchronization ID. */
 	int		sc_bump_id;
 	struct g_mirror_device_sync sc_sync;
 	int		sc_idle;	/* DIRTY flags removed. */
 	time_t		sc_last_write;
 	u_int		sc_writes;
 	u_int		sc_refcnt;	/* Number of softc references */
 
 	TAILQ_HEAD(, g_mirror_event) sc_events;
 	struct mtx	sc_events_mtx;
 
 	struct callout	sc_callout;
 
 	struct root_hold_token *sc_rootmount;
 
 	struct mtx	 sc_done_mtx;
 };
 /* Shorthand for the name of the device's action geom. */
 #define	sc_name	sc_geom->name
 
 struct g_mirror_metadata;
 
 /* Kernel-side entry points of the mirror class. */
 u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state);
 struct g_geom * g_mirror_create(struct g_class *mp,
     const struct g_mirror_metadata *md, u_int type);
 /* Values for the how argument of g_mirror_destroy(). */
 #define	G_MIRROR_DESTROY_SOFT		0
 #define	G_MIRROR_DESTROY_DELAYED	1
 #define	G_MIRROR_DESTROY_HARD		2
 int g_mirror_destroy(struct g_mirror_softc *sc, int how);
 int g_mirror_event_send(void *arg, int state, int flags);
 struct g_mirror_metadata;
 int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md);
 int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md);
 void g_mirror_fill_metadata(struct g_mirror_softc *sc,
     struct g_mirror_disk *disk, struct g_mirror_metadata *md);
 void g_mirror_update_metadata(struct g_mirror_disk *disk);
 
 g_ctl_req_t g_mirror_config;
 #endif	/* _KERNEL */
 
 /*
  * In-memory form of the per-disk metadata.  mirror_metadata_encode()
  * serializes it, little-endian and without padding, into 119 bytes of
  * fields followed by a 16-byte MD5 hash of those bytes.
  */
 struct g_mirror_metadata {
 	char		md_magic[16];	/* Magic value. */
 	uint32_t	md_version;	/* Version number. */
 	char		md_name[16];	/* Mirror name. */
 	uint32_t	md_mid;		/* Mirror unique ID. */
 	uint32_t	md_did;		/* Disk unique ID. */
 	uint8_t		md_all;		/* Number of disks in mirror. */
 	uint32_t	md_genid;	/* Generation ID. */
 	uint32_t	md_syncid;	/* Synchronization ID. */
 	uint8_t		md_priority;	/* Disk priority. */
 	uint32_t	md_slice;	/* Slice size. */
 	uint8_t		md_balance;	/* Balance type. */
 	uint64_t	md_mediasize;	/* Size of the smallest
 					   disk in mirror. */
 	uint32_t	md_sectorsize;	/* Sector size. */
 	uint64_t	md_sync_offset;	/* Synchronized offset. */
 	uint64_t	md_mflags;	/* Additional mirror flags. */
 	uint64_t	md_dflags;	/* Additional disk flags. */
 	char		md_provider[16]; /* Hardcoded provider. */
 	uint64_t	md_provsize;	/* Provider's size. */
 	u_char		md_hash[16];	/* MD5 hash. */
 };
 /*
  * Serialize md into data using the current on-disk layout.
  *
  * Fields are written back to back, integers in little-endian order, for a
  * total of 119 bytes.  The MD5 hash of those bytes is stored in
  * md->md_hash and appended to the output, so data must have room for at
  * least 135 bytes.
  */
 static __inline void
 mirror_metadata_encode(struct g_mirror_metadata *md, u_char *data)
 {
 	MD5_CTX ctx;
 	u_char *p;
 
 	p = data;
 	bcopy(md->md_magic, p, 16);
 	p += 16;
 	le32enc(p, md->md_version);
 	p += 4;
 	bcopy(md->md_name, p, 16);
 	p += 16;
 	le32enc(p, md->md_mid);
 	p += 4;
 	le32enc(p, md->md_did);
 	p += 4;
 	*p++ = md->md_all;
 	le32enc(p, md->md_genid);
 	p += 4;
 	le32enc(p, md->md_syncid);
 	p += 4;
 	*p++ = md->md_priority;
 	le32enc(p, md->md_slice);
 	p += 4;
 	*p++ = md->md_balance;
 	le64enc(p, md->md_mediasize);
 	p += 8;
 	le32enc(p, md->md_sectorsize);
 	p += 4;
 	le64enc(p, md->md_sync_offset);
 	p += 8;
 	le64enc(p, md->md_mflags);
 	p += 8;
 	le64enc(p, md->md_dflags);
 	p += 8;
 	bcopy(md->md_provider, p, 16);
 	p += 16;
 	le64enc(p, md->md_provsize);
 	p += 8;
 
 	/* p is now 119 bytes past data; hash everything written so far. */
 	MD5Init(&ctx);
 	MD5Update(&ctx, data, p - data);
 	MD5Final(md->md_hash, &ctx);
 	bcopy(md->md_hash, p, 16);
 }
 /*
  * Decode metadata stored in the version 0/1 layout from data into md.
  *
  * md_magic and md_version are not touched here; decoding starts with
  * md_name at offset 20.  This layout has neither md_genid nor md_provsize;
  * both are set to 0 on success.
  *
  * The MD5 hash of the first 107 bytes is computed into md->md_hash and
  * compared with the hash stored at offset 107.  Returns 0 when they match
  * and EINVAL otherwise; on EINVAL the other fields have still been filled
  * in and md_genid/md_provsize are left as they were.
  */
 static __inline int
 mirror_metadata_decode_v0v1(const u_char *data, struct g_mirror_metadata *md)
 {
 	MD5_CTX ctx;
 
 	bcopy(data + 20, md->md_name, 16);
 	md->md_mid = le32dec(data + 36);
 	md->md_did = le32dec(data + 40);
 	md->md_all = *(data + 44);
 	md->md_syncid = le32dec(data + 45);
 	md->md_priority = *(data + 49);
 	md->md_slice = le32dec(data + 50);
 	md->md_balance = *(data + 54);
 	md->md_mediasize = le64dec(data + 55);
 	md->md_sectorsize = le32dec(data + 63);
 	md->md_sync_offset = le64dec(data + 67);
 	md->md_mflags = le64dec(data + 75);
 	md->md_dflags = le64dec(data + 83);
 	bcopy(data + 91, md->md_provider, 16);
 	/*
 	 * The stored hash is not copied into md_hash first: MD5Final()
 	 * overwrites all 16 bytes with the computed digest anyway.
 	 */
 	MD5Init(&ctx);
 	MD5Update(&ctx, data, 107);
 	MD5Final(md->md_hash, &ctx);
 	if (bcmp(md->md_hash, data + 107, 16) != 0)
 		return (EINVAL);
 
 	/* New fields. */
 	md->md_genid = 0;
 	md->md_provsize = 0;
 
 	return (0);
 }
 /*
  * Decode the body of a version 2 metadata block.
  *
  * Compared with the v0/v1 layout this one carries md_genid at offset 45,
  * which shifts the later fields by four bytes; the MD5 digest sits at
  * offset 111 and covers the 111 bytes before it.  md_provsize is not part
  * of this layout and is zeroed on success.  Returns 0 when the recomputed
  * digest matches the stored one, EINVAL otherwise.
  */
 static __inline int
 mirror_metadata_decode_v2(const u_char *data, struct g_mirror_metadata *md)
 {
 	MD5_CTX md5;
 
 	bcopy(&data[20], md->md_name, 16);
 	md->md_mid = le32dec(&data[36]);
 	md->md_did = le32dec(&data[40]);
 	md->md_all = data[44];
 	md->md_genid = le32dec(&data[45]);
 	md->md_syncid = le32dec(&data[49]);
 	md->md_priority = data[53];
 	md->md_slice = le32dec(&data[54]);
 	md->md_balance = data[58];
 	md->md_mediasize = le64dec(&data[59]);
 	md->md_sectorsize = le32dec(&data[67]);
 	md->md_sync_offset = le64dec(&data[71]);
 	md->md_mflags = le64dec(&data[79]);
 	md->md_dflags = le64dec(&data[87]);
 	bcopy(&data[95], md->md_provider, 16);
 	bcopy(&data[111], md->md_hash, 16);
 
 	/* md_hash ends up holding the digest computed from the buffer. */
 	MD5Init(&md5);
 	MD5Update(&md5, data, 111);
 	MD5Final(md->md_hash, &md5);
 	if (bcmp(md->md_hash, &data[111], 16) == 0) {
 		/* Field introduced by later versions gets a neutral value. */
 		md->md_provsize = 0;
 		return (0);
 	}
 
 	return (EINVAL);
 }
 /*
  * Decode the body of a version 3 or version 4 metadata block.
  *
  * The offsets mirror the ones used by mirror_metadata_encode(): md_provsize
  * lives at offset 111 and the MD5 digest at offset 119, covering the 119
  * bytes before it.  Returns 0 when the recomputed digest matches the stored
  * one and EINVAL otherwise.
  */
 static __inline int
 mirror_metadata_decode_v3v4(const u_char *data, struct g_mirror_metadata *md)
 {
 	MD5_CTX md5;
 
 	bcopy(&data[20], md->md_name, 16);
 	md->md_mid = le32dec(&data[36]);
 	md->md_did = le32dec(&data[40]);
 	md->md_all = data[44];
 	md->md_genid = le32dec(&data[45]);
 	md->md_syncid = le32dec(&data[49]);
 	md->md_priority = data[53];
 	md->md_slice = le32dec(&data[54]);
 	md->md_balance = data[58];
 	md->md_mediasize = le64dec(&data[59]);
 	md->md_sectorsize = le32dec(&data[67]);
 	md->md_sync_offset = le64dec(&data[71]);
 	md->md_mflags = le64dec(&data[79]);
 	md->md_dflags = le64dec(&data[87]);
 	bcopy(&data[95], md->md_provider, 16);
 	md->md_provsize = le64dec(&data[111]);
 	bcopy(&data[119], md->md_hash, 16);
 
 	/* md_hash ends up holding the digest computed from the buffer. */
 	MD5Init(&md5);
 	MD5Update(&md5, data, 119);
 	MD5Final(md->md_hash, &md5);
 
 	return (bcmp(md->md_hash, &data[119], 16) != 0 ? EINVAL : 0);
 }
 /*
  * Decode an on-disk metadata block into *md.
  *
  * The magic string and the version number sit at the same offsets in every
  * layout, so they are read first; the version then selects the decoder for
  * the rest of the block.  Returns whatever the selected decoder returns, or
  * EINVAL when the version is not one of 0 through 4.
  */
 static __inline int
 mirror_metadata_decode(const u_char *data, struct g_mirror_metadata *md)
 {
 
 	bcopy(&data[0], md->md_magic, 16);
 	md->md_version = le32dec(&data[16]);
 
 	switch (md->md_version) {
 	case 0:
 	case 1:
 		return (mirror_metadata_decode_v0v1(data, md));
 	case 2:
 		return (mirror_metadata_decode_v2(data, md));
 	case 3:
 	case 4:
 		return (mirror_metadata_decode_v3v4(data, md));
 	default:
 		/* Unknown layout. */
 		return (EINVAL);
 	}
 }
 
 /*
  * Map a balance algorithm identifier to its printable name.  Any value
  * above G_MIRROR_BALANCE_MAX yields "unknown".
  */
 static __inline const char *
 balance_name(u_int balance)
 {
 	static const char *names[] = {
 		[G_MIRROR_BALANCE_NONE] = "none",
 		[G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin",
 		[G_MIRROR_BALANCE_LOAD] = "load",
 		[G_MIRROR_BALANCE_SPLIT] = "split",
 		[G_MIRROR_BALANCE_PREFER] = "prefer",
 		/* Catch-all slot right behind the last valid identifier. */
 		[G_MIRROR_BALANCE_MAX + 1] = "unknown"
 	};
 
 	return (names[balance > G_MIRROR_BALANCE_MAX ?
 	    G_MIRROR_BALANCE_MAX + 1 : balance]);
 }
 
 /*
  * Map a balance algorithm name to its identifier.  The comparison is an
  * exact strcmp() match; -1 is returned when no algorithm has that name.
  */
 static __inline int
 balance_id(const char *name)
 {
 	static const char *names[] = {
 		[G_MIRROR_BALANCE_NONE] = "none",
 		[G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin",
 		[G_MIRROR_BALANCE_LOAD] = "load",
 		[G_MIRROR_BALANCE_SPLIT] = "split",
 		[G_MIRROR_BALANCE_PREFER] = "prefer"
 	};
 	int id;
 
 	id = G_MIRROR_BALANCE_MIN;
 	while (id <= G_MIRROR_BALANCE_MAX) {
 		if (strcmp(name, names[id]) == 0)
 			return (id);
 		id++;
 	}
 
 	return (-1);
 }
 
 /*
  * Print every field of *md to standard output, one "label: value" line per
  * field, with the flag words expanded into symbolic names and the MD5 hash
  * rendered as 32 lowercase hexadecimal digits.
  *
  * md_magic, md_name and md_provider are 16-byte fields that the decoders
  * fill with bcopy() straight from the on-disk block, so nothing guarantees
  * a terminating NUL.  They are printed with a precision of 16 so that
  * printf() never reads past the end of a field.
  */
 static __inline void
 mirror_metadata_dump(const struct g_mirror_metadata *md)
 {
 	static const char hex[] = "0123456789abcdef";
 	char hash[16 * 2 + 1];
 	u_int i;
 
 	printf("     magic: %.16s\n", md->md_magic);
 	printf("   version: %u\n", (u_int)md->md_version);
 	printf("      name: %.16s\n", md->md_name);
 	printf("       mid: %u\n", (u_int)md->md_mid);
 	printf("       did: %u\n", (u_int)md->md_did);
 	printf("       all: %u\n", (u_int)md->md_all);
 	printf("     genid: %u\n", (u_int)md->md_genid);
 	printf("    syncid: %u\n", (u_int)md->md_syncid);
 	printf("  priority: %u\n", (u_int)md->md_priority);
 	printf("     slice: %u\n", (u_int)md->md_slice);
 	printf("   balance: %s\n", balance_name((u_int)md->md_balance));
 	printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize);
 	printf("sectorsize: %u\n", (u_int)md->md_sectorsize);
 	printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset);
 
 	/* Mirror-wide flags; only the bits known here are named. */
 	printf("    mflags:");
 	if (md->md_mflags == 0)
 		printf(" NONE");
 	else {
 		if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 			printf(" NOFAILSYNC");
 		if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0)
 			printf(" NOAUTOSYNC");
 	}
 	printf("\n");
 
 	/* Per-disk flags; only the bits known here are named. */
 	printf("    dflags:");
 	if (md->md_dflags == 0)
 		printf(" NONE");
 	else {
 		if ((md->md_dflags & G_MIRROR_DISK_FLAG_DIRTY) != 0)
 			printf(" DIRTY");
 		if ((md->md_dflags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0)
 			printf(" SYNCHRONIZING");
 		if ((md->md_dflags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0)
 			printf(" FORCE_SYNC");
 		if ((md->md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0)
 			printf(" INACTIVE");
 	}
 	printf("\n");
 
 	printf("hcprovider: %.16s\n", md->md_provider);
 	printf("  provsize: %ju\n", (uintmax_t)md->md_provsize);
 
 	/* Two hex digits per hash byte; bzero() supplies the terminator. */
 	bzero(hash, sizeof(hash));
 	for (i = 0; i < 16; i++) {
 		hash[i * 2] = hex[md->md_hash[i] >> 4];
 		hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f];
 	}
 	printf("  MD5 hash: %s\n", hash);
 }
 #endif	/* !_G_MIRROR_H_ */
Index: projects/bsd_rdma_4_9/sys/i386/i386/machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/i386/i386/machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/i386/i386/machdep.c	(revision 326162)
@@ -1,3045 +1,3039 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_perfmon.h"
 #include "opt_platform.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <isa/rtc.h>
 
 #include <net/netisr.h>
 
 #include <machine/bootinfo.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/vm86.h>
 #include <x86/init.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_APIC
 #include <x86/apicvar.h>
 #endif
 
 #ifdef DEV_ISA
 #include <x86/isa/icu.h>
 #endif
 
 /*
  * Sanity check for __curthread(): fail the build unless pc_curthread is
  * at offset 0 of struct pcpu.
  */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 /* Entry points defined here or elsewhere but declared for external use. */
 extern register_t init386(int first);
 extern void dblfault_handler(void);
 
 /* Forward declarations of file-local helpers. */
 static void cpu_startup(void *);
 static void fpstate_drop(struct thread *td);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 /* Register cpu_startup() with SYSINIT at SI_SUB_CPU / SI_ORDER_FIRST. */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /*
  * Intel ICH registers.  ICH_SMI_EN is parenthesized so that the sum stays
  * a single operand wherever the macro is expanded.
  */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
 
 /*
  * Selector values that the signal delivery code in this file loads into
  * the trapframe's %cs (_ucodesel) and data/stack segment registers
  * (_udatasel) before entering a handler.
  */
 int	_udatasel, _ucodesel;
 u_int	basemem;
 
 int cold = 1;
 
 #ifdef COMPAT_43
 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 #ifdef COMPAT_FREEBSD4
 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 
 /*
  * Memory sizes in pages: cpu_startup() converts Maxmem with ptoa() and
  * stores atop() of the reported memory size into realmem.
  */
 long Maxmem = 0;
 long realmem = 0;
 
 #ifdef PAE
 FEATURE(pae, "Physical Address Extensions");
 #endif
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 /*
  * Address ranges stored as consecutive (start, end) pairs; cpu_startup()
  * walks phys_avail two entries at a time until it finds a zero end.
  */
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
 #define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
 
 /* Filled in by vm_ksubmap_init() from cpu_startup(). */
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 /* One struct pcpu slot per possible CPU. */
 struct pcpu __pcpu[MAXCPU];
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
  /* Default init_ops implementation. */
  struct init_ops init_ops = {
 	.early_clock_source_init =	i8254_init,
 	.early_delay =			i8254_delay,
 #ifdef DEV_APIC
 	.msi_init =			msi_init,
 #endif
  };
 
 /*
  * SYSINIT hook (SI_SUB_CPU, SI_ORDER_FIRST) that finishes CPU-level startup.
  *
  * In order it: applies the legacy-USB SMI workaround on the listed Apple
  * models, starts the RTC clock, prints CPU information, reports real and
  * available memory, initializes the kernel submaps, sets up the buffer
  * cache and pager buffers, and finally calls cpu_setregs().
  *
  * The `dummy' argument is the SYSINIT cookie (NULL in the registration) and
  * is not used.
  */
 static void
 cpu_startup(void *dummy)
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge.
 	 */
 	sysenv = kern_getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			/* Clear bit 3 of SMI_EN with a read-modify-write. */
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 * The SMBIOS value is shifted left by 10 (KB to bytes); when it is
 	 * missing or smaller than the free page total, fall back to Maxmem.
 	 */
 	memsize = 0;
 	sysenv = kern_getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		/* phys_avail holds (start, end) pairs; a zero end stops us. */
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)vm_cnt.v_free_count),
 	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by call
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 #ifdef COMPAT_43
 /*
  * Deliver signal ksi->ksi_signo to the current thread using the old
  * (struct osigframe) frame layout.
  *
  * Called with the process lock and p_sigacts->ps_mtx held (both are
  * asserted).  Both are dropped before the frame is copied out and are
  * retaken before returning.  If the copyout() fails the process is
  * terminated through sigexit(td, SIGILL).
  *
  * catcher: handler address stored into the frame.
  * ksi:     signal information (number, code, address).
  * mask:    signal mask saved into the frame's context.
  */
 static void
 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct osigframe sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/*
 	 * Allocate space for the signal handler context: at the top of the
 	 * alternate stack when one is enabled, not already in use and
 	 * requested for this signal; otherwise just below the current %esp.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct osigframe));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		fp = (struct osigframe *)regs->tf_esp - 1;
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* User-space address of the context inside the frame at fp. */
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
 		sf.sf_addr = 0;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* The rest of the function, including copyout(), runs unlocked. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	/* %gs is read from the CPU via rgs(), not from the trapframe. */
 	sf.sf_siginfo.si_sc.sc_gs = rgs();
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
 	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
 	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
 	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
 		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
 		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
 		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
 
 		/* Without VME, VIF/VIP come from the saved vm86_eflags. */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_siginfo.si_sc.sc_ps =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/* See sendsig() for comments. */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the old-style signal trampoline. */
 	regs->tf_esp = (int)fp;
 	if (p->p_sysent->sv_sigcode_base != 0) {
 		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
 		    szosigcode;
 	} else {
 		/* a.out sysentvec does not use shared page */
 		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
 	}
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	load_gs(_udatasel);
 	regs->tf_ss = _udatasel;
 	/* Retake the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Deliver signal ksi->ksi_signo to the current thread using the FreeBSD 4
  * (struct sigframe4) frame layout.
  *
  * Called with the process lock and p_sigacts->ps_mtx held (both are
  * asserted).  Both are dropped before the frame is copied out and are
  * retaken before returning.  If the copyout() fails the process is
  * terminated through sigexit(td, SIGILL).
  *
  * catcher: handler address stored into the frame.
  * ksi:     signal information (number, code, address).
  * mask:    signal mask saved into the frame's ucontext.
  */
 static void
 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe4 sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trapframe is copied as one piece, starting at mc_fs. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
 	bzero(sf.sf_uc.uc_mcontext.__spare__,
 	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/*
 	 * Allocate space for the signal handler context: at the top of the
 	 * alternate stack when one is enabled, not already in use and
 	 * requested for this signal; otherwise just below the current %esp.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sfp = (struct sigframe4 *)regs->tf_esp - 1;
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* User-space address of the ucontext inside the frame at sfp. */
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_code = ksi->ksi_code;
 		sf.sf_si.si_addr = ksi->ksi_addr;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* The rest of the function, including copyout(), runs unlocked. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		/* Without VME, VIF/VIP come from the saved vm86_eflags. */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the FreeBSD 4 signal trampoline. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
 	    szfreebsd4_sigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Retake the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Deliver signal ksi->ksi_signo to the current thread.
  *
  * Signals whose handlers are flagged in ps_freebsd4 or ps_osigset are
  * handed to freebsd4_sendsig() / osendsig() when those compat options are
  * built in.  Otherwise a struct sigframe (plus, when XSAVE is in use, the
  * extended FPU state) is built, copied to the user stack, and the
  * trapframe is redirected to the signal trampoline.
  *
  * Called with the process lock and p_sigacts->ps_mtx held (both are
  * asserted).  Both are dropped before the frame is copied out and are
  * retaken before returning.  If a copyout() fails the process is
  * terminated through sigexit(td, SIGILL).
  *
  * catcher: handler address stored into the frame.
  * ksi:     signal information copied into the frame's siginfo.
  * mask:    signal mask saved into the frame's ucontext.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	struct segment_descriptor *sdp;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/*
 	 * When XSAVE is in use and the extended state is larger than
 	 * union savefpu, reserve a temporary buffer for the excess part.
 	 */
 	if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trapframe is copied as one piece, starting at mc_fs. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	/*
 	 * Unconditionally fill the fsbase and gsbase into the mcontext.
 	 */
 	sdp = &td->td_pcb->pcb_fsd;
 	sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
 	    sdp->sd_lobase;
 	sdp = &td->td_pcb->pcb_gsd;
 	sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
 	    sdp->sd_lobase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare2,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/*
 	 * Allocate space for the signal handler context.  On the normal
 	 * stack the frame starts 128 bytes below %esp.
 	 * NOTE(review): the reason for the 128-byte gap is not stated
 	 * here -- confirm before changing it.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_esp - 128;
 	if (xfpusave != NULL) {
 		/* Extended FPU state goes first, aligned down to 64 bytes. */
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned int)sp & ~0x3F);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* User-space address of the ucontext inside the frame at sfp. */
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* The rest of the function, including copyout(), runs unlocked. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		/* Without VME, VIF/VIP come from the saved vm86_eflags. */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Redirect the thread to the signal trampoline; when there is no
 	 * sv_sigcode_base the trampoline is located below sv_psstrings.
 	 */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = p->p_sysent->sv_sigcode_base;
 	if (regs->tf_eip == 0)
 		regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Retake the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to clean up state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 #ifdef COMPAT_43
 int
 osigreturn(td, uap)
 	struct thread *td;
 	struct osigreturn_args /* {
 		struct osigcontext *sigcntxp;
 	} */ *uap;
 {
 	struct osigcontext sc;
 	struct trapframe *regs;
 	struct osigcontext *scp;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	eflags = scp->sc_ps;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		tf->tf_vm86_ds = scp->sc_ds;
 		tf->tf_vm86_es = scp->sc_es;
 		tf->tf_vm86_fs = scp->sc_fs;
 		tf->tf_vm86_gs = scp->sc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		if (!CS_SECURE(scp->sc_cs)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 		regs->tf_ds = scp->sc_ds;
 		regs->tf_es = scp->sc_es;
 		regs->tf_fs = scp->sc_fs;
 	}
 
 	/* Restore remaining registers. */
 	regs->tf_eax = scp->sc_eax;
 	regs->tf_ebx = scp->sc_ebx;
 	regs->tf_ecx = scp->sc_ecx;
 	regs->tf_edx = scp->sc_edx;
 	regs->tf_esi = scp->sc_esi;
 	regs->tf_edi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_isp = scp->sc_isp;
 	regs->tf_ebp = scp->sc_fp;
 	regs->tf_esp = scp->sc_sp;
 	regs->tf_eip = scp->sc_pc;
 	regs->tf_eflags = eflags;
 
 #if defined(COMPAT_43)
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
 	    SIGPROCMASK_OLD);
 	return (EJUSTRETURN);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * freebsd4_sigreturn() -- sigreturn variant taking a FreeBSD 4 style
  * context (struct ucontext4).
  *
  * The context is copied in from uap->sigcntxp, validated, and then
  * written over the thread's trapframe (td->td_frame).  The signal mask
  * stored in the context is installed last.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if
  * the context is rejected, and EJUSTRETURN once the frame has been
  * replaced.
  *
  * MPSAFE
  */
 int
 freebsd4_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd4_sigreturn_args /* {
 		const ucontext4 *sigcntxp;
 	} */ *uap;
 {
 	struct ucontext4 uc;
 	struct trapframe *regs;
 	struct ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy of the user-supplied context. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		/* The context asks to resume in vm86 mode. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/*
 		 * Go back to user mode if both flags are set.
 		 * SIGBUS is posted here, but processing of the context
 		 * continues below; there is no early return.
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 		/*
 		 * Build the eflags value to resume with: bits inside the
 		 * *_USERCHANGE mask come from the user context, all other
 		 * bits come from the current frame, and PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Overwrite the frame with the saved registers, starting at
 		 * mc_fs.  NOTE(review): this relies on the mcontext fields
 		 * from mc_fs onward having the same layout as
 		 * struct trapframe -- confirm against the mcontext definition.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		/* Replace the copied eflags with the sanitized value. */
 		tf->tf_eflags = eflags;
 		/*
 		 * Move the segment values just copied in to the vm86 slots
 		 * of the frame and load _udatasel into the ordinary ones.
 		 */
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, cs);
 			/* Report it as a protection fault at the current %eip. */
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/* Checks passed: install the saved registers (see layout note above). */
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 #if defined(COMPAT_43)
 	/* Bit 0 of mc_onstack selects the SS_ONSTACK state to restore. */
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	/* Install the signal mask carried in the context. */
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * sys_sigreturn() -- return from a signal handler using a ucontext_t.
  *
  * The context is copied in from uap->sigcntxp and validated.  For a
  * non-vm86 context the floating point state (including any extended
  * state the context points at) is restored through set_fpcontext()
  * before the trapframe is overwritten.  The signal mask stored in the
  * context is installed last.
  *
  * Returns the copyin() or set_fpcontext() error on failure, EINVAL if
  * the context is rejected, and EJUSTRETURN once the frame has been
  * replaced.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	p = td->td_proc;
 
 	/* Work on a kernel copy of the user-supplied context. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	/* Reject contexts carrying flag bits outside _MC_FLAG_MASK. */
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		/* The context asks to resume in vm86 mode. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/*
 		 * Go back to user mode if both flags are set.
 		 * SIGBUS is posted here, but processing of the context
 		 * continues below; there is no early return.
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/*
 		 * Build the eflags value to resume with: bits inside the
 		 * *_USERCHANGE mask come from the user context, all other
 		 * bits come from the current frame, and PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Overwrite the frame with the saved registers, starting at
 		 * mc_fs.  NOTE(review): this relies on the mcontext fields
 		 * from mc_fs onward having the same layout as
 		 * struct trapframe -- confirm against the mcontext definition.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		/* Replace the copied eflags with the sanitized value. */
 		tf->tf_eflags = eflags;
 		/*
 		 * Move the segment values just copied in to the vm86 slots
 		 * of the frame and load _udatasel into the ordinary ones.
 		 */
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, cs);
 			/* Report it as a protection fault at the current %eip. */
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/*
 		 * If the context says it carries extended FPU state, bound
 		 * its length and copy it in from user space before handing
 		 * it to set_fpcontext().
 		 */
 		if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 			xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 			/*
 			 * The length is user-controlled; cap it before it is
 			 * used as a stack allocation size below.
 			 */
 			if (xfpustate_len > cpu_max_ext_state_size -
 			    sizeof(union savefpu)) {
 				uprintf(
 			    "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 				    p->p_pid, td->td_name, xfpustate_len);
 				return (EINVAL);
 			}
 			/* Temporary buffer on this function's stack. */
 			xfpustate = __builtin_alloca(xfpustate_len);
 			error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 			    xfpustate, xfpustate_len);
 			if (error != 0) {
 				uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 				    p->p_pid, td->td_name);
 				return (error);
 			}
 		} else {
 			xfpustate = NULL;
 			xfpustate_len = 0;
 		}
 		/* Restore FP state first; the frame is untouched if this fails. */
 		ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
 		    xfpustate_len);
 		if (ret != 0)
 			return (ret);
 		/* Checks passed: install the saved registers (see layout note above). */
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 #if defined(COMPAT_43)
 	/* Bit 0 of mc_onstack selects the SS_ONSTACK state to restore. */
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	/* Install the signal mask carried in the context. */
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 /*
  * Reset registers to default values on exec.
  *
  * td    - thread whose trapframe and pcb are reset.
  * imgp  - image being executed; supplies entry_addr and ps_strings.
  * stack - value loaded into the user stack pointer (tf_esp).
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
 	pcb->pcb_gs = _udatasel;
 	load_gs(_udatasel);
 
 	/*
 	 * Free a per-process LDT if there is one.
 	 * NOTE(review): dt_lock is only released explicitly on the else
 	 * path; presumably user_ldt_free() drops it itself -- confirm.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt)
 		user_ldt_free(td);
 	else
 		mtx_unlock_spin(&dt_lock);
   
 	/*
 	 * Reset the fs and gs bases.  The values from the old address
 	 * space do not make sense for the new program.  In particular,
 	 * gsbase might be the TLS base for the old program but the new
 	 * program has no TLS now.
 	 */
 	set_fsbase(td, 0);
 	set_gsbase(td, 0);
 
+	/* Make sure edx is 0x0 on entry. Linux binaries depend on it. */
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_eip = imgp->entry_addr;
 	regs->tf_esp = stack;
 	/*
 	 * NOTE(review): tf_eflags was zeroed by the bzero() above, so the
 	 * (regs->tf_eflags & PSL_T) term evaluates to 0 at this point.
 	 */
 	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_cs = _ucodesel;
 
 	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 	regs->tf_ebx = imgp->ps_strings;
 
         /*
          * Reset the hardware debug registers if they were in use.
          * They won't have any meaning for the newly exec'd process.  
          */
         if (pcb->pcb_flags & PCB_DBREGS) {
                 pcb->pcb_dr0 = 0;
                 pcb->pcb_dr1 = 0;
                 pcb->pcb_dr2 = 0;
                 pcb->pcb_dr3 = 0;
                 pcb->pcb_dr6 = 0;
                 pcb->pcb_dr7 = 0;
                 if (pcb == curpcb) {
 		        /*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 		        reset_dbregs();
                 }
 		pcb->pcb_flags &= ~PCB_DBREGS;
         }
 
 	/* Start the new image with the default x87 control word. */
 	pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
-
-	/*
-	 * XXX - Linux emulator
-	 * Make sure sure edx is 0x0 on entry. Linux binaries depend
-	 * on it.
-	 */
-	td->td_retval[1] = 0;
 }
 
 /*
  * Program %cr0 with the control bits the kernel runs with and load the
  * user data selector into %gs.
  */
 void
 cpu_setregs(void)
 {
 	unsigned int ctl;
 
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS exist for NPX (FPU) support.
 	 *
 	 * All ESC (NPX) instructions and all WAIT instructions have to
 	 * trap.  CR0_EM alone is not enough because it does not make WAIT
 	 * trap, so CR0_MP is set and CR0_TS is what actually arms the
 	 * trap.  If WAIT did not trap, the "wait" forms of the no-wait
 	 * control instructions would silently behave like the "no-wait"
 	 * forms after an FP context switch while working correctly the
 	 * rest of the time.  Without an NPX the trap matters even more,
 	 * since the "wait" forms would then always degenerate.
 	 *
 	 * CR0_NE is requested for correct error reporting on 486DX's;
 	 * lesser processors should either refuse it or ignore it.
 	 *
 	 * CR0_WP and CR0_AM are turned on in the same write.
 	 */
 	ctl = rcr0() | (CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
 	load_cr0(ctl);
 
 	load_gs(_udatasel);
 }
 
 /*
  * machdep.guessed_bootdev: read-only sysctl exposing bootdev.
  */
 u_long bootdev;		/* not a struct cdev * - the encoding is different */
 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 
 /*
  * machdep.bootmethod: read-only sysctl exposing the firmware boot
  * method string, which is initialized to "BIOS".
  */
 static char bootmethod[16] = "BIOS";
 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
     "System firmware boot method");
 
 /*
  * Initialize 386 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 int _default_ldt;
 
 /*
  * Descriptor tables and the pseudo-descriptors that describe them.
  * gdt[] is sized NGDT * MAXCPU (presumably one NGDT-entry slice per
  * possible CPU -- confirm where the slices are set up).  idt initially
  * points at the statically allocated idt0[].
  */
 union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
 union descriptor ldt[NLDT];		/* local descriptor table */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 struct region_descriptor r_gdt, r_idt;	/* table descriptors */
 struct mtx dt_lock;			/* lock for GDT and LDT */
 
 /*
  * Separate TSS and one-page stack; the GPANIC_SEL entry of gdt_segs[]
  * points at dblfault_tss (by its name, for double fault handling).
  */
 static struct i386tss dblfault_tss;
 static char dblfault_stack[PAGE_SIZE];
 
 extern  vm_offset_t	proc0kstack;
 
 
 /*
  * software prototypes -- in more palatable form.
  *
  * Each entry is a soft_segment_descriptor template for one GDT slot.
  * The comment in front of an entry names the selector and gives the
  * slot number, which is also the entry's position in this array, so
  * entries must not be reordered or removed.
  *
  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUFS_SEL	2 %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS_SEL	3 %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	6 Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 {	.ssd_base = 0x400,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 /* The base is 0 in this template (presumably filled in later -- confirm). */
 {
 	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct i386tss)-1,
 	.ssd_type = SDT_SYS386TSS,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GLDT_SEL	10 LDT Descriptor */
 /* Covers the whole static ldt[] array. */
 {	.ssd_base = (int) ldt,
 	.ssd_limit = sizeof(ldt)-1,
 	.ssd_type = SDT_SYSLDT,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 User LDT Descriptor per process */
 /* The template also starts at ldt[], with room for 512 descriptors. */
 {	.ssd_base = (int) ldt,
 	.ssd_limit = (512 * sizeof(union descriptor)-1),
 	.ssd_type = SDT_SYSLDT,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GPANIC_SEL	12 Panic Tss Descriptor */
 /* Points at the statically allocated dblfault_tss. */
 {	.ssd_base = (int) &dblfault_tss,
 	.ssd_limit = sizeof(struct i386tss)-1,
 	.ssd_type = SDT_SYS386TSS,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GNDIS_SEL	18 NDIS Descriptor */
 /* Not present (ssd_p = 0) in this template. */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 
 /*
  * Templates for the six LDT entries set up from this file.  Slots 0, 1,
  * 2 and 4 are not-present placeholders (ssd_p = 0); slot 3 is the user
  * code segment and slot 5 the user data segment.  As with gdt_segs[],
  * an entry's position is its slot number.
  */
 static struct soft_segment_descriptor ldt_segs[] = {
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 };
 
 /*
  * Fill in IDT slot `idx' as a present gate of type `typ' and privilege
  * level `dpl' that enters `func' through code selector `selec'.
  */
 void
 setidt(idx, func, typ, dpl, selec)
 	int idx;
 	inthand_t *func;
 	int typ;
 	int dpl;
 	int selec;
 {
 	struct gate_descriptor *gate = &idt[idx];
 	int handler = (int)func;
 
 	/* The handler address is stored in two pieces. */
 	gate->gd_looffset = handler;
 	gate->gd_hioffset = handler >> 16;
 
 	/* Caller-selected attributes. */
 	gate->gd_selector = selec;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 
 	/* Fixed fields: nothing copied on entry, reserved bits clear. */
 	gate->gd_stkcpy = 0;
 	gate->gd_xx = 0;
 
 	/* Mark the gate present. */
 	gate->gd_p = 1;
 }
 
 /*
  * Interrupt/trap entry points named through IDTVEC().  They are only
  * declared here; some are declared only when the matching kernel
  * option (KDTRACE_HOOKS, XENHVM) is defined.
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall),
 #endif
 	IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 
 /*
  * Show privileged registers: the descriptor table registers, the task
  * and LDT selectors, the control registers, and a handful of MSRs that
  * are read only when the matching CPU feature test passes.
  */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	uint64_t dtr;
 
 	/* Table registers are printed as base/limit. */
 	dtr = ridt();
 	db_printf("idtr\t0x%08x/%04x\n",
 	    (u_int)(dtr >> 16), (u_int)(dtr & 0xffff));
 	dtr = rgdt();
 	db_printf("gdtr\t0x%08x/%04x\n",
 	    (u_int)(dtr >> 16), (u_int)(dtr & 0xffff));
 
 	db_printf("ldtr\t0x%04x\n", rldt());
 	db_printf("tr\t0x%04x\n", rtr());
 
 	db_printf("cr0\t0x%08x\n", rcr0());
 	db_printf("cr2\t0x%08x\n", rcr2());
 	db_printf("cr3\t0x%08x\n", rcr3());
 	db_printf("cr4\t0x%08x\n", rcr4());
 
 	/* %xcr0 is only read when CR4_XSAVE is set in %cr4. */
 	if ((rcr4() & CR4_XSAVE) != 0) {
 		db_printf("xcr0\t0x%016llx\n", rxcr(0));
 	}
 
 	/* Each MSR below is guarded by its own feature check. */
 	if ((amd_feature & (AMDID_NX | AMDID_LM)) != 0) {
 		db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER));
 	}
 	if ((cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) != 0) {
 		db_printf("FEATURES_CTL\t0x%016llx\n",
 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
 	}
 	if (CPUID_TO_FAMILY(cpu_id) >= 6 &&
 	    (cpu_vendor_id == CPU_VENDOR_INTEL ||
 	    cpu_vendor_id == CPU_VENDOR_AMD)) {
 		db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR));
 	}
 	if ((cpu_feature & CPUID_PAT) != 0) {
 		db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT));
 	}
 }
 
 /*
  * DDB show command "dbregs": print the values returned by rdr0()
  * through rdr3(), rdr6() and rdr7(), one per line, as 8-digit hex.
  */
 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 {
 
 	db_printf("dr0\t0x%08x\n", rdr0());
 	db_printf("dr1\t0x%08x\n", rdr1());
 	db_printf("dr2\t0x%08x\n", rdr2());
 	db_printf("dr3\t0x%08x\n", rdr3());
 	db_printf("dr6\t0x%08x\n", rdr6());
 	db_printf("dr7\t0x%08x\n", rdr7());	
 }
 #endif
 
 void
 sdtossd(sd, ssd)
 	struct segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Add the region [base, base + length) to the sorted physmap array of
  * base/bound pairs.
  *
  * physmap      - array of PHYSMAP_SIZE entries holding base/bound pairs.
  * physmap_idxp - in/out: index of the base slot of the last pair in
  *                use; advanced by 2 when a new pair is inserted.
  *
  * Zero-length regions, regions above 4GB on non-PAE kernels and
  * regions overlapping an existing pair are ignored.  A region that
  * abuts an existing pair is merged into it instead of taking a slot.
  *
  * Returns 1 when the caller may keep adding regions and 0 when the
  * array is full.  On the 0 return *physmap_idxp is left untouched, so
  * it still names a valid pair inside the array.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 #ifndef PAE
 	if (base > 0xffffffff) {
 		printf("%uK of memory above 4GB ignored\n",
 		    (u_int)(length / 1024));
 		return (1);
 	}
 #endif
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 */
 	insert_idx = physmap_idx + 2;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is needed.  Check for room before advancing the
 	 * caller's index: the caller keeps indexing physmap[] with it
 	 * after a failure, and an index of PHYSMAP_SIZE would point past
 	 * the end of the array.
 	 */
 	if (physmap_idx + 2 >= PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = physmap_idx; i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 /*
  * Hand one BIOS SMAP record to add_physmap_entry().
  *
  * The record is printed first on a verbose boot.  Records whose type
  * is not SMAP_TYPE_MEMORY are skipped and reported as success (1);
  * otherwise the result of add_physmap_entry() is returned.
  */
 static int
 add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
 {
 	int verbose;
 
 	verbose = (boothowto & RB_VERBOSE) != 0;
 	if (verbose) {
 		printf("SMAP type=%02x base=%016llx len=%016llx\n",
 		    smap->type, smap->base, smap->length);
 	}
 
 	if (smap->type == SMAP_TYPE_MEMORY) {
 		return (add_physmap_entry(smap->base, smap->length, physmap,
 		    physmap_idxp));
 	}
 	return (1);
 }
 
 /*
  * Walk a loader-supplied INT 15:E820 memory map and add every record
  * to physmap, stopping at the first record add_smap_entry() refuses.
  */
 static void
 add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	struct bios_smap *cur, *limit;
 	u_int32_t nbytes;
 
 	/*
 	 * The byte length of the map is stored in the 32-bit word right
 	 * in front of it.  subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 */
 	nbytes = ((u_int32_t *)smapbase)[-1];
 	limit = (struct bios_smap *)((uintptr_t)smapbase + nbytes);
 
 	cur = smapbase;
 	while (cur < limit) {
 		if (add_smap_entry(cur, physmap, physmap_idxp) == 0)
 			return;
 		cur++;
 	}
 }
 
 /*
  * Clamp basemem to 640K and make the pages between the end of base
  * memory and ISA_HOLE_START writable, both through the kernel map and
  * through the vm86 page table.
  */
 static void
 basemem_setup(void)
 {
 	vm_paddr_t addr;
 	pt_entry_t *vm86_pt;
 	int pg;
 
 	if (basemem > 640) {
 		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 			basemem);
 		basemem = 640;
 	}
 
 	/*
 	 * XXX when basemem is below 640 there is a `hole' between the end
 	 * of base memory and the start of ISA memory.  It may be empty or
 	 * may hold BIOS code or data, so map it read/write to let the
 	 * BIOS write to it.
 	 *
 	 * Background: memory from 0 to the physical end of the kernel
 	 * starts out mapped read-only and parts of it are remapped later;
 	 * whatever is not remapped stays a read-only hole the kernel does
 	 * not use.  Base memory lies below the physical end of the kernel
 	 * and is such a hole right now.  The part from PAGE_SIZE to
 	 * (trunc_page(biosbasemem * 1024) - 1) will be remapped and used
 	 * by the kernel later.
 	 *
 	 * This mirrors what pmap_mapdev does, but nothing has to be
 	 * allocated, so only the mapping is changed.
 	 */
 	addr = trunc_page(basemem * 1024);
 	while (addr < ISA_HOLE_START) {
 		pmap_kenter(KERNBASE + addr, addr);
 		addr += PAGE_SIZE;
 	}
 
 	/*
 	 * Enter the same pages, if any, r/w in the vm86 page table as
 	 * well so that vm86 can scribble on them through the vm86 map.
 	 * XXX: why 2 ways for this and only 1 way for page 0, at least as
 	 * initialized here?
 	 */
 	vm86_pt = (pt_entry_t *)vm86paddr;
 	pg = basemem / 4;
 	while (pg < 160) {
 		vm86_pt[pg] = (pg << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 		pg++;
 	}
 }
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * If we cannot accurately determine the physical memory map, then use
  * value from the 0xE801 call, and failing that, the RTC.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(int first)
 {
 	int has_smap, off, physmap_idx, pa_indx, da_indx;
 	u_long memtest;
 	vm_paddr_t physmap[PHYSMAP_SIZE];
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size, physmem_tunable;
 	int hasbrokenint12, i, res;
 	u_int extmem;
 	struct vm86frame vmf;
 	struct vm86context vmc;
 	vm_paddr_t pa;
 	struct bios_smap *smap, *smapbase;
 	caddr_t kmdp;
 
 	has_smap = 0;
 	bzero(&vmf, sizeof(vmf));
 	bzero(physmap, sizeof(physmap));
 	basemem = 0;
 
 	/*
 	 * Check if the loader supplied an SMAP memory map.  If so,
 	 * use that and do not make any VM86 calls.
 	 */
 	physmap_idx = 0;
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf32 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase != NULL) {
 		add_smap_entries(smapbase, physmap, &physmap_idx);
 		has_smap = 1;
 		goto have_smap;
 	}
 
 	/*
 	 * Some newer BIOSes have a broken INT 12H implementation
 	 * which causes a kernel panic immediately.  In this case, we
 	 * need use the SMAP to determine the base memory size.
 	 */
 	hasbrokenint12 = 0;
 	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
 	if (hasbrokenint12 == 0) {
 		/* Use INT12 to determine base memory size. */
 		vm86_intcall(0x12, &vmf);
 		basemem = vmf.vmf_ax;
 		basemem_setup();
 	}
 
 	/*
 	 * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
 	 * the kernel page table so we can use it as a buffer.  The
 	 * kernel will unmap this page later.
 	 */
 	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 	vmc.npages = 0;
 	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
 	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
 	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
 
 	vmf.vmf_ebx = 0;
 	do {
 		vmf.vmf_eax = 0xE820;
 		vmf.vmf_edx = SMAP_SIG;
 		vmf.vmf_ecx = sizeof(struct bios_smap);
 		i = vm86_datacall(0x15, &vmf, &vmc);
 		if (i || vmf.vmf_eax != SMAP_SIG)
 			break;
 		has_smap = 1;
 		if (!add_smap_entry(smap, physmap, &physmap_idx))
 			break;
 	} while (vmf.vmf_ebx != 0);
 
 have_smap:
 	/*
 	 * If we didn't fetch the "base memory" size from INT12,
 	 * figure it out from the SMAP (or just guess).
 	 */
 	if (basemem == 0) {
 		for (i = 0; i <= physmap_idx; i += 2) {
 			if (physmap[i] == 0x00000000) {
 				basemem = physmap[i + 1] / 1024;
 				break;
 			}
 		}
 
 		/* XXX: If we couldn't find basemem from SMAP, just guess. */
 		if (basemem == 0)
 			basemem = 640;
 		basemem_setup();
 	}
 
 	if (physmap[1] != 0)
 		goto physmap_done;
 
 	/*
 	 * If we failed to find an SMAP, figure out the extended
 	 * memory size.  We will then build a simple memory map with
 	 * two segments, one for "base memory" and the second for
 	 * "extended memory".  Note that "extended memory" starts at a
 	 * physical address of 1MB and that both basemem and extmem
 	 * are in units of 1KB.
 	 *
 	 * First, try to fetch the extended memory size via INT 15:E801.
 	 */
 	vmf.vmf_ax = 0xE801;
 	if (vm86_intcall(0x15, &vmf) == 0) {
 		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
 	} else {
 		/*
 		 * If INT15:E801 fails, this is our last ditch effort
 		 * to determine the extended memory size.  Currently
 		 * we prefer the RTC value over INT15:88.
 		 */
 #if 0
 		vmf.vmf_ah = 0x88;
 		vm86_intcall(0x15, &vmf);
 		extmem = vmf.vmf_ax;
 #else
 		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
 #endif
 	}
 
 	/*
 	 * Special hack for chipsets that still remap the 384k hole when
 	 * there's 16MB of memory - this really confuses people that
 	 * are trying to use bus mastering ISA controllers with the
 	 * "16MB limit"; they only have 16MB, but the remapping puts
 	 * them beyond the limit.
 	 *
 	 * If extended memory is between 15-16MB (16-17MB phys address range),
 	 *	chop it to 15MB.
 	 */
 	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
 		extmem = 15 * 1024;
 
 	physmap[0] = 0;
 	physmap[1] = basemem * 1024;
 	physmap_idx = 2;
 	physmap[physmap_idx] = 0x100000;
 	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 
 physmap_done:
 	/*
 	 * Now, physmap contains a map of physical memory.
 	 */
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1]);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this 
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 *
 	 * This is especially confusing when it is much larger than the
 	 * memory size and is displayed as "realmem".
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
 	 * the amount of memory in the system.
 	 */
 	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	/*
 	 * By default enable the memory test on real hardware, and disable
 	 * it if we appear to be running in a VM.  This avoids touching all
 	 * pages unnecessarily, which doesn't matter on real hardware but is
 	 * bad for shared VM hosts.  Use a general name so that
 	 * one could eventually do more with the code than just disable it.
 	 */
 	memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * If Maxmem has been increased beyond what the system has detected,
 	 * extend the last memory segment to the new limit.
 	 */ 
 	if (atop(physmap[physmap_idx + 1]) < Maxmem)
 		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 */
 	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP3;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR3;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= KERNLOAD && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_N;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa;	/* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	*pte = 0;
 	invltlb();
 	
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 		    off);
 }
 
 /*
  * Bring up the kernel debugger support.
  *
  * When DDB is configured, the symbol table bounds recorded in bootinfo are
  * passed to db_fetch_ksymtab() before kdb_init() runs.  When KDB is
  * configured and the RB_KDB bit is set in boothowto, the debugger is
  * entered right away.
  */
 static void
 i386_kdb_init(void)
 {
 #ifdef DDB
 	db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab);
 #endif
 	kdb_init();
 #ifdef KDB
 	/* The boot flags asked for the debugger: stop here. */
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 #endif
 }
 
 /*
  * Early machine-dependent initialization of the boot processor.
  *
  * 'first' is used as a physical address: DPCPU_SIZE bytes starting there
  * are mapped at pa + KERNBASE and given to dpcpu_init(), and the advanced
  * value is then handed to getmemsize().
  *
  * In order, this routine: links proc0/thread0, picks up the loader metadata
  * and environment from bootinfo, builds and loads the GDT, LDT and IDT,
  * initializes the clock and CPU, sets up the double-fault and common TSS,
  * sizes memory, initializes the console and debugger, sets up thread0's pcb
  * and installs the LDT call gate.
  *
  * Returns the address of thread0's pcb, which locore uses as the location
  * of the kernel stack.
  */
 register_t
 init386(int first)
 {
 	struct gate_descriptor *gdp;
 	int gsel_tss, metadata_missing, x, pa;
 	struct pcpu *pc;
 	struct xstate_hdr *xhdr;
 	int late_console;
 
 	thread0.td_kstack = proc0kstack;
 	thread0.td_kstack_pages = TD0_KSTACK_PAGES;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/*
 	 * Pick up the preload metadata from the loader, if any.  The
 	 * addresses stored in bootinfo are offset by KERNBASE before use.
 	 * A missing module pointer is only remembered here; the warning is
 	 * printed further down.
 	 */
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep) {
 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 		preload_bootstrap_relocate(KERNBASE);
 	} else {
 		metadata_missing = 1;
 	}
 
 	if (bootinfo.bi_envp != 0)
 		init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0);
 	else
 		init_static_kenv(NULL, 0);
 
 	identify_hypervisor();
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * Make gdt memory segments.  All segments cover the full 4GB
 	 * of address space and permissions are enforced at page level.
 	 */
 	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 
 	/* The private and TSS segments are based on CPU 0's pcpu area. */
 	pc = &__pcpu[0];
 	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 
 	/* Convert the software descriptors and load the resulting GDT. */
 	for (x = 0; x < NGDT; x++)
 		ssdtosd(&gdt_segs[x], &gdt[x].sd);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (int) gdt;
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 	lgdt(&r_gdt);
 
 	/*
 	 * Initialize CPU 0's pcpu area, then map DPCPU_SIZE bytes starting
 	 * at 'first' at pa + KERNBASE for its dynamic per-CPU data and step
 	 * 'first' past them.
 	 */
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 		pmap_kenter(pa + KERNBASE, pa);
 	dpcpu_init((void *)(first + KERNBASE), 0);
 	first += DPCPU_SIZE;
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	/* Non-late cninit() and printf() can be moved up to here. */
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 	for (x = 0; x < nitems(ldt_segs); x++)
 		ssdtosd(&ldt_segs[x], &ldt[x].sd);
 
 	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	/*
 	 * exceptions
 	 *
 	 * Every vector is first pointed at the "rsvd" handler; the known
 	 * exception vectors are then overridden one by one.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 		    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
 	    , GSEL(GCODE_SEL, SEL_KPL));
 	/* The double fault vector is a task gate to the GPANIC_SEL TSS. */
 	setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (int) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the clock before the console so that console
 	 * initialization can use DELAY().
 	 */
 	clock_init();
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	/*
 	 * NOTE(review): the #UD and #GP vectors are installed a second time
 	 * here; presumably finishidentcpu() may replace them while probing
 	 * the CPU -- confirm against finishidentcpu().
 	 */
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	initializecpu();	/* Initialize CPU registers */
 	initializecpucache();
 
 	/* pointer to selector slot for %fs/%gs */
 	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 
 	/*
 	 * Fill in the double fault TSS: every stack pointer starts at the
 	 * end of dblfault_stack and execution starts at dblfault_handler.
 	 */
 	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 #if defined(PAE) || defined(PAE_TABLES)
 	dblfault_tss.tss_cr3 = (int)IdlePDPT;
 #else
 	dblfault_tss.tss_cr3 = (int)IdlePTD;
 #endif
 	dblfault_tss.tss_eip = (int)dblfault_handler;
 	dblfault_tss.tss_eflags = PSL_KERNEL;
 	dblfault_tss.tss_ds = dblfault_tss.tss_es =
 	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 
 	/* Initialize the tss (except for the final esp0) early for vm86. */
 	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 	    thread0.td_kstack_pages * PAGE_SIZE - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 	ltr(gsel_tss);
 
 	/* Initialize the PIC early for vm86 calls. */
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #endif
 
 	/*
 	 * The console and kdb should be initialized even earlier than here,
 	 * but some console drivers don't work until after getmemsize().
 	 * Default to late console initialization to support these drivers.
 	 * This loses mainly printf()s in getmemsize() and early debugging.
 	 */
 	late_console = 1;
 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
 	if (!late_console) {
 		cninit();
 		i386_kdb_init();
 	}
 
 	vm86_initialize();
 	getmemsize(first);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	if (late_console)
 		cninit();
 
 	/* Report the missing metadata noted at the top of this function. */
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 	if (late_console)
 		i386_kdb_init();
 
 	msgbufinit(msgbufp, msgbufsize);
 	npxinit(true);
 	/*
 	 * Set up thread0 pcb after npxinit calculated pcb + fpu save
 	 * area size.  Zero out the extended state header in fpu save
 	 * area.
 	 */
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 	if (use_xsave) {
 		/* The xstate header sits directly after the save area. */
 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 		    1);
 		xhdr->xstate_bv = xsave_mask;
 	}
 	PCPU_SET(curpcb, thread0.td_pcb);
 	/* Move esp0 in the tss to its final place. */
 	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
 	PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16);
 	gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;	/* clear busy bit */
 	ltr(gsel_tss);
 
 	/* make a call gate to reenter kernel with */
 	gdp = &ldt[LSYS5CALLS_SEL].gd;
 
 	/* The handler address is stored as separate low and high parts. */
 	x = (int) &IDTVEC(lcall_syscall);
 	gdp->gd_looffset = x;
 	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 	gdp->gd_stkcpy = 1;
 	gdp->gd_type = SDT_SYS386CGT;
 	gdp->gd_dpl = SEL_UPL;
 	gdp->gd_p = 1;
 	gdp->gd_hioffset = x >> 16;
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 #if defined(PAE) || defined(PAE_TABLES)
 	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 #else
 	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 #endif
 	thread0.td_pcb->pcb_ext = 0;
 	thread0.td_frame = &proc0_tf;
 
 	cpu_probe_amdc1e();
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 
 	/* Location of kernel stack for locore */
 	return ((register_t)thread0.td_pcb);
 }
 
 /*
  * Machine-dependent part of per-CPU data initialization: the ACPI id of the
  * given pcpu is set to 0xffffffff.  'cpuid' and 'size' are not used here.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Sysctl handler that exports the BIOS system memory map (SMAP).
  *
  * The map is looked up in the preload metadata of the kernel module
  * ("elf kernel", falling back to "elf32 kernel").  Each entry is copied out
  * as a struct bios_smap_xattr; the xattr field comes from the separate
  * MODINFOMD_SMAP_XATTR array when the loader supplied one and is 0
  * otherwise.
  *
  * Returns 0 when no SMAP metadata is present or every entry was copied
  * out, otherwise the first error reported by SYSCTL_OUT().
  */
 static int
 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct bios_smap *smapbase;
 	struct bios_smap_xattr smap;
 	caddr_t kmdp;
 	uint32_t *smapattr;
 	int count, error, i;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf32 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		return (0);
 	smapattr = (uint32_t *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 	/*
 	 * The 32-bit word immediately before the SMAP data is read as the
 	 * size of the data in bytes.
 	 */
 	count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
 	error = 0;
 	for (i = 0; i < count; i++) {
 		smap.base = smapbase[i].base;
 		smap.length = smapbase[i].length;
 		smap.type = smapbase[i].type;
 		if (smapattr != NULL)
 			smap.xattr = smapattr[i];
 		else
 			smap.xattr = 0;
 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
 		/*
 		 * Stop at the first failure so that a later successful
 		 * copy cannot hide it from the caller.
 		 */
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 
 /*
  * Enter a spinlock section for the current thread.  On the outermost entry
  * interrupts are disabled and the previous flags are saved in the thread;
  * nested entries only bump the nesting count.  A critical section is
  * entered in both cases.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	if (self->td_md.md_spinlock_count != 0) {
 		/* Nested entry: interrupts are already off. */
 		self->td_md.md_spinlock_count++;
 	} else {
 		saved = intr_disable();
 		self->td_md.md_spinlock_count = 1;
 		self->td_md.md_saved_flags = saved;
 	}
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread.  The critical section is
  * exited first; when the nesting count drops to zero the flags saved by the
  * outermost spinlock_enter() are restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	critical_exit();
 	/* Fetch the saved flags before the count is dropped. */
 	saved = self->td_md.md_saved_flags;
 	if (--self->td_md.md_spinlock_count == 0)
 		intr_restore(saved);
 }
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 static void f00f_hack(void *unused);
 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
 
 /*
  * Install the workaround for the Pentium F00F bug.  Does nothing unless
  * has_f00f_bug is set.
  *
  * The IDT is copied into a freshly allocated two-page area, positioned so
  * that seven descriptors lie in the first page, the CPU is pointed at the
  * copy, and the first page is then made read-only.
  */
 static void
 f00f_hack(void *unused)
 {
 	struct gate_descriptor *new_idt;
 	vm_offset_t tmp;
 
 	if (!has_f00f_bug)
 		return;
 
 	GIANT_REQUIRED;
 
 	printf("Intel Pentium detected, installing workaround for F00F bug\n");
 
 	/* Two zeroed pages: the relocated IDT straddles their boundary. */
 	tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
 	if (tmp == 0)
 		panic("kmem_malloc returned 0");
 
 	/* Put the problematic entry (#6) at the end of the lower page. */
 	new_idt = (struct gate_descriptor*)
 	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 	bcopy(idt, new_idt, sizeof(idt0));
 	/* Switch the CPU to the copy before publishing it through 'idt'. */
 	r_idt.rd_base = (u_int)new_idt;
 	lidt(&r_idt);
 	idt = new_idt;
 	/* Make the lower page (descriptors 0 through 6) read-only. */
 	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
 }
 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() uses this so that a backtrace
  * can start at the function that entered the debugger: the context lives in
  * the trapframe, but the trace is driven by the PCB.  Only the fields needed
  * for a backtrace are filled in; %gs is read from the running CPU.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 	pcb->pcb_eip = tf->tf_eip;
 	pcb->pcb_ebp = tf->tf_ebp;
 	pcb->pcb_ebx = tf->tf_ebx;
 	pcb->pcb_esi = tf->tf_esi;
 	pcb->pcb_edi = tf->tf_edi;
 	/*
 	 * With a nonzero privilege level in %cs the saved tf_esp is used;
 	 * otherwise the stack pointer is derived from the frame address.
 	 * NOTE(review): the "- 8" presumably skips the tf_esp/tf_ss slots
 	 * that are not pushed for same-privilege traps -- confirm.
 	 */
 	if (ISPL(tf->tf_cs))
 		pcb->pcb_esp = tf->tf_esp;
 	else
 		pcb->pcb_esp = (int)(tf + 1) - 8;
 	pcb->pcb_gs = rgs();
 }
 
 /*
  * Set the saved program counter (%eip) in the thread's trapframe to 'addr'.
  * Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	frame->tf_eip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Copy the thread's register state into 'regs'.  %gs comes from the pcb;
  * everything else is taken from the trapframe by fill_frame_regs(), whose
  * return value is passed through.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	regs->r_gs = td->td_pcb->pcb_gs;
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Copy the registers held in trapframe 'tp' into 'regs'.  r_gs is not
  * touched here.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 	/* General-purpose registers. */
 	regs->r_eax = tp->tf_eax;
 	regs->r_ebx = tp->tf_ebx;
 	regs->r_ecx = tp->tf_ecx;
 	regs->r_edx = tp->tf_edx;
 	regs->r_esi = tp->tf_esi;
 	regs->r_edi = tp->tf_edi;
 	regs->r_ebp = tp->tf_ebp;
 
 	/* Data segment registers. */
 	regs->r_ds = tp->tf_ds;
 	regs->r_es = tp->tf_es;
 	regs->r_fs = tp->tf_fs;
 
 	/* Instruction pointer, flags and stack. */
 	regs->r_eip = tp->tf_eip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_eflags = tp->tf_eflags;
 	regs->r_esp = tp->tf_esp;
 	regs->r_ss = tp->tf_ss;
 	return (0);
 }
 
 /*
  * Load the thread's register state from 'regs'.
  *
  * Returns EINVAL, without changing anything, when the new eflags fail
  * EFL_SECURE() against the current eflags or the new %cs fails CS_SECURE().
  * Otherwise the trapframe and the pcb's %gs are updated and 0 is returned.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	if (!EFL_SECURE(regs->r_eflags, frame->tf_eflags))
 		return (EINVAL);
 	if (!CS_SECURE(regs->r_cs))
 		return (EINVAL);
 
 	/* General-purpose registers. */
 	frame->tf_eax = regs->r_eax;
 	frame->tf_ebx = regs->r_ebx;
 	frame->tf_ecx = regs->r_ecx;
 	frame->tf_edx = regs->r_edx;
 	frame->tf_esi = regs->r_esi;
 	frame->tf_edi = regs->r_edi;
 	frame->tf_ebp = regs->r_ebp;
 
 	/* Data segment registers; %gs lives in the pcb. */
 	frame->tf_ds = regs->r_ds;
 	frame->tf_es = regs->r_es;
 	frame->tf_fs = regs->r_fs;
 	td->td_pcb->pcb_gs = regs->r_gs;
 
 	/* Instruction pointer, flags and stack. */
 	frame->tf_eip = regs->r_eip;
 	frame->tf_cs = regs->r_cs;
 	frame->tf_eflags = regs->r_eflags;
 	frame->tf_esp = regs->r_esp;
 	frame->tf_ss = regs->r_ss;
 	return (0);
 }
 
 /*
  * Copy the thread's user FPU state into 'fpregs'.  The thread must be the
  * current thread, suspended, or in a stopping process (asserted).  After
  * npxgetregs() the user save area is either copied as-is (no FXSR) or
  * converted from the XMM layout by npx_fill_fpregs_xmm().  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 
 	npxgetregs(td);
 	if (!cpu_fxsr) {
 		bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs,
 		    sizeof(*fpregs));
 	} else {
 		npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm,
 		    (struct save87 *)fpregs);
 	}
 	return (0);
 }
 
 /*
  * Load the thread's user FPU state from 'fpregs'.  The data is either
  * copied straight into the user save area (no FXSR) or converted into the
  * XMM layout by npx_set_fpregs_xmm(); npxuserinited() is called afterwards.
  * Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	if (!cpu_fxsr) {
 		bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87,
 		    sizeof(*fpregs));
 	} else {
 		npx_set_fpregs_xmm((struct save87 *)fpregs,
 		    &get_pcb_user_save_td(td)->sv_xmm);
 	}
 	npxuserinited(td);
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fills 'mcp' from the thread's trapframe and pcb.  When 'flags' contains
  * GET_MC_CLEAR_RET, %eax and %edx are reported as 0 and PSL_C is cleared in
  * the reported eflags.  The basic FPU state is included through
  * get_fpcontext(); no extended FPU state is returned (mc_xfpustate and
  * mc_xfpustate_len are set to 0).  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct trapframe *tp;
 	struct segment_descriptor *sdp;
 
 	tp = td->td_frame;
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_esp);
 	PROC_UNLOCK(curthread->td_proc);
 	mcp->mc_gs = td->td_pcb->pcb_gs;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_edi = tp->tf_edi;
 	mcp->mc_esi = tp->tf_esi;
 	mcp->mc_ebp = tp->tf_ebp;
 	mcp->mc_isp = tp->tf_isp;
 	mcp->mc_eflags = tp->tf_eflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	} else {
 		mcp->mc_eax = tp->tf_eax;
 		mcp->mc_edx = tp->tf_edx;
 	}
 	mcp->mc_ebx = tp->tf_ebx;
 	mcp->mc_ecx = tp->tf_ecx;
 	mcp->mc_eip = tp->tf_eip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_esp = tp->tf_esp;
 	mcp->mc_ss = tp->tf_ss;
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp, NULL, 0);
 	/*
 	 * Rebuild the %fs and %gs base addresses from the split descriptor
 	 * fields.  The high part is shifted as an unsigned value:
 	 * NOTE(review): sd_hibase is presumably a narrow bit-field that
 	 * promotes to signed int, in which case shifting a value >= 0x80
 	 * left by 24 would overflow -- confirm against the descriptor
 	 * definition.
 	 */
 	sdp = &td->td_pcb->pcb_fsd;
 	mcp->mc_fsbase = (u_int)sdp->sd_hibase << 24 | sdp->sd_lobase;
 	sdp = &td->td_pcb->pcb_gsd;
 	mcp->mc_gsbase = (u_int)sdp->sd_hibase << 24 | sdp->sd_lobase;
 	mcp->mc_flags = 0;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * Only the user-changeable eflags bits (PSL_USERCHANGE) are taken from
  * 'mcp'; the remaining bits keep their current value, and the %cs selector
  * is left alone.
  *
  * Returns EINVAL for a wrong mc_len, unknown mc_flags bits or an oversized
  * extended FPU state, the copyin() error if the extended state cannot be
  * read, the set_fpcontext() error if the FPU state is rejected, and 0 on
  * success.  The trapframe is only modified on success.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct trapframe *tp;
 	char *xfpustate;
 	int eflags, ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	if ((mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 
 	eflags = (tp->tf_eflags & ~PSL_USERCHANGE) |
 	    (mcp->mc_eflags & PSL_USERCHANGE);
 
 	/* Pull in the optional extended FPU state, bounded by its max size. */
 	xfpustate = NULL;
 	if ((mcp->mc_flags & _MC_HASFPXSTATE) != 0) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(union savefpu))
 			return (EINVAL);
 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	}
 
 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 
 	/* Everything validated: commit the integer state. */
 	tp->tf_eax = mcp->mc_eax;
 	tp->tf_ebx = mcp->mc_ebx;
 	tp->tf_ecx = mcp->mc_ecx;
 	tp->tf_edx = mcp->mc_edx;
 	tp->tf_esi = mcp->mc_esi;
 	tp->tf_edi = mcp->mc_edi;
 	tp->tf_ebp = mcp->mc_ebp;
 	tp->tf_ds = mcp->mc_ds;
 	tp->tf_es = mcp->mc_es;
 	tp->tf_fs = mcp->mc_fs;
 	td->td_pcb->pcb_gs = mcp->mc_gs;
 	tp->tf_eip = mcp->mc_eip;
 	tp->tf_eflags = eflags;
 	tp->tf_esp = mcp->mc_esp;
 	tp->tf_ss = mcp->mc_ss;
 	return (0);
 }
 
 /*
  * Store the thread's FPU state in a machine context.
  *
  * mc_ownedfp receives the result of npxgetregs(), the user save area is
  * copied into mc_fpstate and mc_fpformat is set from npxformat().
  *
  * When XSAVE is in use and the caller supplied a buffer (xfpusave_len != 0),
  * the data that follows the save area is also copied into 'xfpusave', at
  * most cpu_max_ext_state_size - sizeof(union savefpu) bytes.  If the buffer
  * is larger than that, the part beyond the copied data is zeroed.
  * _MC_HASFPXSTATE is set and mc_xfpustate_len records the number of bytes
  * copied.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = npxgetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = npxformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(union savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		len = max_len;
 		/*
 		 * Clear the tail of the caller's buffer that will not be
 		 * filled.  The length must come from the original buffer
 		 * size; 'len' has already been clamped to max_len.
 		 */
 		bzero(xfpusave + max_len, xfpusave_len - max_len);
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Load the thread's FPU state from a machine context.
  *
  * Returns 0 without doing anything for _MC_FPFMT_NODEV, and EINVAL for any
  * format other than _MC_FPFMT_387 or _MC_FPFMT_XMM.  For a valid format,
  * _MC_FPOWNED_NONE drops the current FPU state, _MC_FPOWNED_FPU and
  * _MC_FPOWNED_PCB pass the saved state to npxsetregs() and return its
  * result, and any other mc_ownedfp value yields EINVAL.
  */
 static int
 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 	    mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	return (npxsetregs(td, (union savefpu *)&mcp->mc_fpstate,
 	    xfpustate, xfpustate_len));
 }
 
 /*
  * Discard the FPU state of a thread.  The thread must own the FPU on behalf
  * of user code (asserted through PCB_USER_FPU).  Inside a critical section
  * the hardware state is dropped if 'td' is this CPU's fpcurthread, and the
  * PCB_NPXINITDONE and PCB_NPXUSERINITDONE flags are cleared.
  *
  * NOTE(review): the flags are cleared on curthread's pcb rather than on
  * td's; this only matches when td == curthread -- confirm that all callers
  * pass the current thread.
  */
 static void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		npxdrop();
 	/*
 	 * XXX force a full drop of the npx.  The above only drops it if we
 	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 	 *
 	 * XXX I don't much like npxgetregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of npxgetregs()... perhaps we just
 	 * have too many layers.
 	 */
 	curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
 	    PCB_NPXUSERINITDONE);
 	critical_exit();
 }
 
 /*
  * Read the debug registers into 'dbregs'.  With a thread, the values saved
  * in its pcb are returned; with td == NULL the hardware registers of the
  * running CPU are read.  dr4 and dr5 are always reported as 0.  Always
  * returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	if (td != NULL) {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 		return (0);
 	}
 
 	/* No thread given: sample the live hardware registers. */
 	dbregs->dr[0] = rdr0();
 	dbregs->dr[1] = rdr1();
 	dbregs->dr[2] = rdr2();
 	dbregs->dr[3] = rdr3();
 	dbregs->dr[6] = rdr6();
 	dbregs->dr[7] = rdr7();
 	return (0);
 }
 
 /*
  * Write the debug registers from 'dbregs'.
  *
  * With td == NULL the hardware registers of the running CPU are loaded
  * directly, without validation.  With a thread, the values are validated
  * and stored in its pcb, and PCB_DBREGS is set.
  *
  * Returns EINVAL if dr7 uses an access or length encoding of 0x02 for any
  * breakpoint, or if an enabled breakpoint address is at or above
  * VM_MAXUSER_ADDRESS; returns 0 otherwise.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 		return (0);
 	}
 
 	/*
 	 * Refuse undefined dr7 settings.  These bit patterns result in
 	 * undefined behaviour and can lead to an unexpected TRCTRAP.
 	 */
 	for (i = 0; i < 4; i++) {
 		if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02 ||
 		    DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 			return (EINVAL);
 	}
 
 	pcb = td->td_pcb;
 
 	/*
 	 * A process must not be able to enable a breakpoint outside of its
 	 * own address space: with ddb enabled, a breakpoint in the kernel
 	 * could halt the system.  Check dr0 through dr3 whenever the
 	 * matching breakpoint is enabled in dr7.
 	 *
 	 * XXX - what about when the watched area of the user's address
 	 * space is written into from within the kernel ... wouldn't that
 	 * still cause a breakpoint to be generated from within kernel mode?
 	 */
 	for (i = 0; i < 4; i++) {
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], i) &&
 		    dbregs->dr[i] >= VM_MAXUSER_ADDRESS)
 			return (EINVAL);
 	}
 
 	pcb->pcb_dr0 = dbregs->dr[0];
 	pcb->pcb_dr1 = dbregs->dr[1];
 	pcb->pcb_dr2 = dbregs->dr[2];
 	pcb->pcb_dr3 = dbregs->dr[3];
 	pcb->pcb_dr6 = dbregs->dr[6];
 	pcb->pcb_dr7 = dbregs->dr[7];
 	pcb->pcb_flags |= PCB_DBREGS;
 	return (0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Out-of-line wrapper around inb(): returns the byte read from 'port'. */
 u_char
 inb_(u_short port)
 {
 	return (inb(port));
 }
 
 /* Out-of-line wrapper around outb(): writes the byte 'data' to 'port'. */
 void
 outb_(u_short port, u_char data)
 {
 	outb(port, data);
 }
 
 #endif /* KDB */
Index: projects/bsd_rdma_4_9/sys/i386/include/atomic.h
===================================================================
--- projects/bsd_rdma_4_9/sys/i386/include/atomic.h	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/i386/include/atomic.h	(revision 326162)
@@ -1,854 +1,866 @@
 /*-
  * Copyright (c) 1998 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef _MACHINE_ATOMIC_H_
 #define	_MACHINE_ATOMIC_H_
 
 #ifndef _SYS_CDEFS_H_
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
 #ifdef _KERNEL
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #endif
 
 #ifndef __OFFSETOF_MONITORBUF
 /*
  * __OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
  *
  * The open-coded number is used instead of the symbolic expression to
  * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
  * An assertion in i386/vm_machdep.c ensures that the value is correct.
  */
 #define	__OFFSETOF_MONITORBUF	0x80
 
 /*
  * Kernel barrier primitive: a locked "addl $0" applied to the u_int at
  * offset __OFFSETOF_MONITORBUF within the %fs segment.  Adding zero
  * leaves the stored value unchanged; the "memory" clobber also keeps the
  * compiler from moving memory accesses across the statement.
  */
 static __inline void
 __mbk(void)
 {
 
 	__asm __volatile("lock; addl $0,%%fs:%0"
 	    : "+m" (*(u_int *)__OFFSETOF_MONITORBUF) : : "memory", "cc");
 }
 
 /*
  * Barrier primitive used outside the kernel: a locked "addl $0" applied
  * to the word at (%esp), i.e. the top of the stack.  Adding zero leaves
  * the stored value unchanged; the "memory" clobber also keeps the
  * compiler from moving memory accesses across the statement.
  */
 static __inline void
 __mbu(void)
 {
 
 	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
 }
 #endif
 
 /*
  * Various simple operations on memory, each of which is atomic in the
  * presence of interrupts and multiple processors.
  *
  * atomic_set_char(P, V)	(*(u_char *)(P) |= (V))
  * atomic_clear_char(P, V)	(*(u_char *)(P) &= ~(V))
  * atomic_add_char(P, V)	(*(u_char *)(P) += (V))
  * atomic_subtract_char(P, V)	(*(u_char *)(P) -= (V))
  *
  * atomic_set_short(P, V)	(*(u_short *)(P) |= (V))
  * atomic_clear_short(P, V)	(*(u_short *)(P) &= ~(V))
  * atomic_add_short(P, V)	(*(u_short *)(P) += (V))
  * atomic_subtract_short(P, V)	(*(u_short *)(P) -= (V))
  *
  * atomic_set_int(P, V)		(*(u_int *)(P) |= (V))
  * atomic_clear_int(P, V)	(*(u_int *)(P) &= ~(V))
  * atomic_add_int(P, V)		(*(u_int *)(P) += (V))
  * atomic_subtract_int(P, V)	(*(u_int *)(P) -= (V))
  * atomic_swap_int(P, V)	(return (*(u_int *)(P)); *(u_int *)(P) = (V);)
  * atomic_readandclear_int(P)	(return (*(u_int *)(P)); *(u_int *)(P) = 0;)
  *
  * atomic_set_long(P, V)	(*(u_long *)(P) |= (V))
  * atomic_clear_long(P, V)	(*(u_long *)(P) &= ~(V))
  * atomic_add_long(P, V)	(*(u_long *)(P) += (V))
  * atomic_subtract_long(P, V)	(*(u_long *)(P) -= (V))
  * atomic_swap_long(P, V)	(return (*(u_long *)(P)); *(u_long *)(P) = (V);)
  * atomic_readandclear_long(P)	(return (*(u_long *)(P)); *(u_long *)(P) = 0;)
  */
 
 /*
  * The above functions are expanded inline in the statically-linked
  * kernel.  Lock prefixes are generated if an SMP kernel is being
  * built.
  *
  * Kernel modules call real functions which are built into the kernel.
  * This allows kernel modules to be portable between UP and SMP systems.
  */
 #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM)
 /*
  * Declaration-only form of ATOMIC_ASM(): it emits prototypes for the
  * plain and the "_barr_" variant of an operation and generates no body.
  * The OP, CONS and V arguments are accepted but not used here.
  */
 #define	ATOMIC_ASM(NAME, TYPE, OP, CONS, V)			\
 void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v);	\
 void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 
 int	atomic_cmpset_char(volatile u_char *dst, u_char expect, u_char src);
 int	atomic_cmpset_short(volatile u_short *dst, u_short expect, u_short src);
 int	atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src);
 int	atomic_fcmpset_char(volatile u_char *dst, u_char *expect, u_char src);
 int	atomic_fcmpset_short(volatile u_short *dst, u_short *expect,
 	    u_short src);
 int	atomic_fcmpset_int(volatile u_int *dst, u_int *expect, u_int src);
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 int	atomic_testandset_int(volatile u_int *p, u_int v);
 int	atomic_testandclear_int(volatile u_int *p, u_int v);
 void	atomic_thread_fence_acq(void);
 void	atomic_thread_fence_acq_rel(void);
 void	atomic_thread_fence_rel(void);
 void	atomic_thread_fence_seq_cst(void);
 
 /*
  * Declaration-only forms of ATOMIC_LOAD() and ATOMIC_STORE(): each
  * expands to the prototype of the load-acquire or store-release function
  * for u_<TYPE>, without the trailing semicolon.
  */
 #define	ATOMIC_LOAD(TYPE)					\
 u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 
 int		atomic_cmpset_64(volatile uint64_t *, uint64_t, uint64_t);
 uint64_t	atomic_load_acq_64(volatile uint64_t *);
 void		atomic_store_rel_64(volatile uint64_t *, uint64_t);
 uint64_t	atomic_swap_64(volatile uint64_t *, uint64_t);
+uint64_t	atomic_fetchadd_64(volatile uint64_t *, uint64_t);
 
 #else /* !KLD_MODULE && __GNUCLIKE_ASM */
 
 /*
  * For userland, always use lock prefixes so that the binaries will run
  * on both SMP and !SMP systems.
  */
 /*
  * MPLOCKED is pasted in front of the instruction in the asm templates
  * below: it is the "lock" prefix for SMP kernels and for non-kernel
  * builds, and empty otherwise.
  */
 #if defined(SMP) || !defined(_KERNEL)
 #define	MPLOCKED	"lock ; "
 #else /* _KERNEL && !SMP */
 #define	MPLOCKED
 #endif /* SMP || !_KERNEL */
 
 /*
  * The assembly is volatilized to avoid code chunk removal by the compiler.
  * GCC aggressively reorders operations and memory clobbering is necessary
  * in order to avoid that for memory barriers.
  */
 /*
  * ATOMIC_ASM(NAME, TYPE, OP, CONS, V) defines two inline functions:
  *
  *   atomic_<NAME>_<TYPE>(p, v)       - clobbers only "cc"
  *   atomic_<NAME>_barr_<TYPE>(p, v)  - additionally clobbers "memory"
  *
  * Both execute MPLOCKED followed by the instruction template OP, with *p
  * as read/write memory operand %0 and the expression V (under constraint
  * CONS) as operand %1.  The trailing "struct __hack" lets each use of the
  * macro be terminated with a semicolon.
  */
 #define	ATOMIC_ASM(NAME, TYPE, OP, CONS, V)		\
 static __inline void					\
 atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 {							\
 	__asm __volatile(MPLOCKED OP			\
 	: "+m" (*p)					\
 	: CONS (V)					\
 	: "cc");					\
 }							\
 							\
 static __inline void					\
 atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 {							\
 	__asm __volatile(MPLOCKED OP			\
 	: "+m" (*p)					\
 	: CONS (V)					\
 	: "memory", "cc");				\
 }							\
 struct __hack
 
 /*
  * Atomic compare and set, used by the mutex functions.
  *
  * cmpset:
  *	if (*dst == expect)
  *		*dst = src
  *
  * fcmpset:
  *	if (*dst == *expect)
  *		*dst = src
  *	else
  *		*expect = *dst
  *
  * Returns 0 on failure, non-zero on success.
  */
 /*
  * ATOMIC_CMPSET(TYPE, CONS) defines atomic_cmpset_<TYPE>() and
  * atomic_fcmpset_<TYPE>().  Both run MPLOCKED cmpxchg with the expected
  * value bound to the accumulator ("+a") and src under constraint CONS,
  * then use sete to turn the resulting zero flag into the return value.
  * In the fcmpset form the accumulator operand is *expect itself, so the
  * value left in the accumulator by cmpxchg is stored back through the
  * caller's pointer.
  */
 #define	ATOMIC_CMPSET(TYPE, CONS)			\
 static __inline int					\
 atomic_cmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE expect, u_##TYPE src) \
 {							\
 	u_char res;					\
 							\
 	__asm __volatile(				\
 	"	" MPLOCKED "		"		\
 	"	cmpxchg	%3,%1 ;		"		\
 	"	sete	%0 ;		"		\
 	"# atomic_cmpset_" #TYPE "	"		\
 	: "=q" (res),			/* 0 */		\
 	  "+m" (*dst),			/* 1 */		\
 	  "+a" (expect)			/* 2 */		\
 	: CONS (src)			/* 3 */		\
 	: "memory", "cc");				\
 	return (res);					\
 }							\
 							\
 static __inline int					\
 atomic_fcmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE *expect, u_##TYPE src) \
 {							\
 	u_char res;					\
 							\
 	__asm __volatile(				\
 	"	" MPLOCKED "		"		\
 	"	cmpxchg	%3,%1 ;		"		\
 	"	sete	%0 ;		"		\
 	"# atomic_fcmpset_" #TYPE "	"		\
 	: "=q" (res),			/* 0 */		\
 	  "+m" (*dst),			/* 1 */		\
 	  "+a" (*expect)		/* 2 */		\
 	: CONS (src)			/* 3 */		\
 	: "memory", "cc");				\
 	return (res);					\
 }
 
 /* Instantiate for char, short and int; char uses the "q" constraint. */
 ATOMIC_CMPSET(char, "q");
 ATOMIC_CMPSET(short, "r");
 ATOMIC_CMPSET(int, "r");
 
 /*
  * Atomically add the value of v to the integer pointed to by p and return
  * the previous value of *p.
  */
 static __inline u_int
 atomic_fetchadd_int(volatile u_int *p, u_int v)
 {
 
 	/*
 	 * xaddl exchanges its two operands and stores their sum in the
 	 * memory operand, so after the instruction v (operand 0, "+r")
 	 * holds what *p contained beforehand.
 	 */
 	__asm __volatile(
 	"	" MPLOCKED "		"
 	"	xaddl	%0,%1 ;		"
 	"# atomic_fetchadd_int"
 	: "+r" (v),			/* 0 */
 	  "+m" (*p)			/* 1 */
 	: : "cc");
 	return (v);
 }
 
 /*
  * Atomically set bit (v & 0x1f) of *p with btsl.  The return value is
  * the carry flag left by btsl, captured with setc, i.e. the state of
  * that bit before the operation.
  */
 static __inline int
 atomic_testandset_int(volatile u_int *p, u_int v)
 {
 	u_char res;
 
 	__asm __volatile(
 	"	" MPLOCKED "		"
 	"	btsl	%2,%1 ;		"
 	"	setc	%0 ;		"
 	"# atomic_testandset_int"
 	: "=q" (res),			/* 0 */
 	  "+m" (*p)			/* 1 */
 	: "Ir" (v & 0x1f)		/* 2 */
 	: "cc");
 	return (res);
 }
 
 /*
  * Atomically clear bit (v & 0x1f) of *p with btrl.  The return value is
  * the carry flag left by btrl, captured with setc, i.e. the state of
  * that bit before the operation.
  */
 static __inline int
 atomic_testandclear_int(volatile u_int *p, u_int v)
 {
 	u_char res;
 
 	__asm __volatile(
 	"	" MPLOCKED "		"
 	"	btrl	%2,%1 ;		"
 	"	setc	%0 ;		"
 	"# atomic_testandclear_int"
 	: "=q" (res),			/* 0 */
 	  "+m" (*p)			/* 1 */
 	: "Ir" (v & 0x1f)		/* 2 */
 	: "cc");
 	return (res);
 }
 
 /*
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
  * However, a load may pass a store if they are performed on distinct
  * addresses, so we need Store/Load barrier for sequentially
  * consistent fences in SMP kernels.  We use "lock addl $0,mem" for a
  * Store/Load barrier, as recommended by the AMD Software Optimization
  * Guide, and not mfence.  In the kernel, we use a private per-cpu
  * cache line for "mem", to avoid introducing false data
  * dependencies.  In user space, we use the word at the top of the
  * stack.
  *
  * For UP kernels, however, the memory of the single processor is
  * always consistent, so we only need to stop the compiler from
  * reordering accesses in a way that violates the semantics of acquire
  * and release.
  */
 
 /*
  * Select the Store/Load barrier: __mbk() for SMP kernels, a pure
  * compiler barrier for UP kernels, and __mbu() outside the kernel.
  */
 #if defined(_KERNEL)
 #if defined(SMP)
 #define	__storeload_barrier()	__mbk()
 #else /* _KERNEL && UP */
 #define	__storeload_barrier()	__compiler_membar()
 #endif /* SMP */
 #else /* !_KERNEL */
 #define	__storeload_barrier()	__mbu()
 #endif /* _KERNEL */
 
 /*
  * ATOMIC_LOAD(TYPE) defines atomic_load_acq_<TYPE>(): a plain read of
  * *p followed by a compiler barrier, so the compiler cannot move later
  * memory accesses ahead of the load.  The trailing "struct __hack" lets
  * each use of the macro be terminated with a semicolon.
  */
 #define	ATOMIC_LOAD(TYPE)					\
 static __inline u_##TYPE					\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)			\
 {								\
 	u_##TYPE res;						\
 								\
 	res = *p;						\
 	__compiler_membar();					\
 	return (res);						\
 }								\
 struct __hack
 
 /*
  * ATOMIC_STORE(TYPE) defines atomic_store_rel_<TYPE>(): a compiler
  * barrier followed by a plain write of v to *p, so the compiler cannot
  * move earlier memory accesses past the store.  The trailing
  * "struct __hack" lets each use be terminated with a semicolon.
  */
 #define	ATOMIC_STORE(TYPE)					\
 static __inline void						\
 atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)	\
 {								\
 								\
 	__compiler_membar();					\
 	*p = v;							\
 }								\
 struct __hack
 
 /* Acquire fence: implemented as a compiler barrier only. */
 static __inline void
 atomic_thread_fence_acq(void)
 {
 
 	__compiler_membar();
 }
 
 /* Release fence: implemented as a compiler barrier only. */
 static __inline void
 atomic_thread_fence_rel(void)
 {
 
 	__compiler_membar();
 }
 
 /* Acquire+release fence: implemented as a compiler barrier only. */
 static __inline void
 atomic_thread_fence_acq_rel(void)
 {
 
 	__compiler_membar();
 }
 
 /*
  * Sequentially consistent fence: the only fence that issues
  * __storeload_barrier() rather than just a compiler barrier.
  */
 static __inline void
 atomic_thread_fence_seq_cst(void)
 {
 
 	__storeload_barrier();
 }
 
 #ifdef _KERNEL
 
 #ifdef WANT_FUNCTIONS
 int		atomic_cmpset_64_i386(volatile uint64_t *, uint64_t, uint64_t);
 int		atomic_cmpset_64_i586(volatile uint64_t *, uint64_t, uint64_t);
 uint64_t	atomic_load_acq_64_i386(volatile uint64_t *);
 uint64_t	atomic_load_acq_64_i586(volatile uint64_t *);
 void		atomic_store_rel_64_i386(volatile uint64_t *, uint64_t);
 void		atomic_store_rel_64_i586(volatile uint64_t *, uint64_t);
 uint64_t	atomic_swap_64_i386(volatile uint64_t *, uint64_t);
 uint64_t	atomic_swap_64_i586(volatile uint64_t *, uint64_t);
 #endif
 
 /*
  * I486 does not support SMP or CMPXCHG8B.
  *
  * 64-bit compare-and-set done as two 32-bit halves with interrupts
  * disabled: pushfl/cli on entry, popfl on exit to restore the saved
  * flags.  expect arrives in %edx:%eax ("+A") and is overwritten by the
  * comparison.  Returns non-zero if *dst matched expect and was replaced
  * by src, 0 otherwise.
  */
 static __inline int
 atomic_cmpset_64_i386(volatile uint64_t *dst, uint64_t expect, uint64_t src)
 {
 	volatile uint32_t *p;
 	u_char res;
 
 	p = (volatile uint32_t *)dst;
 	/*
 	 * The xorl/xorl/orl sequence yields zero only when both halves of
 	 * *dst equal expect.  jne skips the two stores on a mismatch; the
 	 * movl stores do not alter the flags, so sete still reports the
 	 * outcome of the orl.
 	 */
 	__asm __volatile(
 	"	pushfl ;		"
 	"	cli ;			"
 	"	xorl	%1,%%eax ;	"
 	"	xorl	%2,%%edx ;	"
 	"	orl	%%edx,%%eax ;	"
 	"	jne	1f ;		"
 	"	movl	%4,%1 ;		"
 	"	movl	%5,%2 ;		"
 	"1:				"
 	"	sete	%3 ;		"
 	"	popfl"
 	: "+A" (expect),		/* 0 */
 	  "+m" (*p),			/* 1 */
 	  "+m" (*(p + 1)),		/* 2 */
 	  "=q" (res)			/* 3 */
 	: "r" ((uint32_t)src),		/* 4 */
 	  "r" ((uint32_t)(src >> 32))	/* 5 */
 	: "memory", "cc");
 	return (res);
 }
 
 /*
  * 64-bit load for CPUs without cmpxchg8b: reads the low and high 32-bit
  * halves of *p into %eax and %edx with interrupts disabled
  * (pushfl/cli ... popfl) and returns them as one value via "=&A".
  */
 static __inline uint64_t
 atomic_load_acq_64_i386(volatile uint64_t *p)
 {
 	volatile uint32_t *q;
 	uint64_t res;
 
 	q = (volatile uint32_t *)p;
 	__asm __volatile(
 	"	pushfl ;		"
 	"	cli ;			"
 	"	movl	%1,%%eax ;	"
 	"	movl	%2,%%edx ;	"
 	"	popfl"
 	: "=&A" (res)			/* 0 */
 	: "m" (*q),			/* 1 */
 	  "m" (*(q + 1))		/* 2 */
 	: "memory");
 	return (res);
 }
 
 /*
  * 64-bit store for CPUs without cmpxchg8b: v is supplied in %edx:%eax
  * ("A") and written to *p as two 32-bit stores, low half first, with
  * interrupts disabled (pushfl/cli ... popfl).
  */
 static __inline void
 atomic_store_rel_64_i386(volatile uint64_t *p, uint64_t v)
 {
 	volatile uint32_t *q;
 
 	q = (volatile uint32_t *)p;
 	__asm __volatile(
 	"	pushfl ;		"
 	"	cli ;			"
 	"	movl	%%eax,%0 ;	"
 	"	movl	%%edx,%1 ;	"
 	"	popfl"
 	: "=m" (*q),			/* 0 */
 	  "=m" (*(q + 1))		/* 1 */
 	: "A" (v)			/* 2 */
 	: "memory");
 }
 
 /*
  * 64-bit swap for CPUs without cmpxchg8b.  With interrupts disabled
  * (pushfl/cli ... popfl) the current halves of *p are first loaded into
  * %edx:%eax and then both halves are overwritten with v.  Returns the
  * value that was loaded, i.e. the previous contents of *p.
  */
 static __inline uint64_t
 atomic_swap_64_i386(volatile uint64_t *p, uint64_t v)
 {
 	volatile uint32_t *q;
 	uint64_t res;
 
 	q = (volatile uint32_t *)p;
 	__asm __volatile(
 	"	pushfl ;		"
 	"	cli ;			"
 	"	movl	%1,%%eax ;	"
 	"	movl	%2,%%edx ;	"
 	"	movl	%4,%2 ;		"
 	"	movl	%3,%1 ;		"
 	"	popfl"
 	: "=&A" (res),			/* 0 */
 	  "+m" (*q),			/* 1 */
 	  "+m" (*(q + 1))		/* 2 */
 	: "r" ((uint32_t)v),		/* 3 */
 	  "r" ((uint32_t)(v >> 32)));	/* 4 */
 	return (res);
 }
 
 /*
  * 64-bit compare-and-set built on cmpxchg8b: expect is passed in
  * %edx:%eax ("+A") and src in %ecx:%ebx ("c"/"b").  sete converts the
  * zero flag set by cmpxchg8b into the return value: non-zero when *dst
  * was replaced, 0 otherwise.
  */
 static __inline int
 atomic_cmpset_64_i586(volatile uint64_t *dst, uint64_t expect, uint64_t src)
 {
 	u_char res;
 
 	__asm __volatile(
 	"	" MPLOCKED "		"
 	"	cmpxchg8b %1 ;		"
 	"	sete	%0"
 	: "=q" (res),			/* 0 */
 	  "+m" (*dst),			/* 1 */
 	  "+A" (expect)			/* 2 */
 	: "b" ((uint32_t)src),		/* 3 */
 	  "c" ((uint32_t)(src >> 32))	/* 4 */
 	: "memory", "cc");
 	return (res);
 }
 
 /*
  * 64-bit load built on cmpxchg8b.  %edx:%eax is first made equal to
  * %ecx:%ebx (whatever those registers hold), so the instruction either
  * rewrites *p with the value it already contains or, on a mismatch,
  * loads the current contents into %edx:%eax.  Either way %edx:%eax ends
  * up holding the contents of *p, which is returned.
  */
 static __inline uint64_t
 atomic_load_acq_64_i586(volatile uint64_t *p)
 {
 	uint64_t res;
 
 	__asm __volatile(
 	"	movl	%%ebx,%%eax ;	"
 	"	movl	%%ecx,%%edx ;	"
 	"	" MPLOCKED "		"
 	"	cmpxchg8b %1"
 	: "=&A" (res),			/* 0 */
 	  "+m" (*p)			/* 1 */
 	: : "memory", "cc");
 	return (res);
 }
 
 /*
  * 64-bit store built on cmpxchg8b.  v arrives in %edx:%eax ("+A") and is
  * copied to %ecx:%ebx as the replacement value.  The cmpxchg8b/jne loop
  * repeats until the exchange succeeds; a failed attempt leaves the
  * current contents of *p in %edx:%eax as the comparand for the retry.
  */
 static __inline void
 atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v)
 {
 
 	__asm __volatile(
 	"	movl	%%eax,%%ebx ;	"
 	"	movl	%%edx,%%ecx ;	"
 	"1:				"
 	"	" MPLOCKED "		"
 	"	cmpxchg8b %0 ;		"
 	"	jne	1b"
 	: "+m" (*p),			/* 0 */
 	  "+A" (v)			/* 1 */
 	: : "ebx", "ecx", "memory", "cc");
 }
 
 /*
  * 64-bit swap built on cmpxchg8b.  The instruction sequence is the same
  * as in atomic_store_rel_64_i586(); in addition v, bound to %edx:%eax as
  * an in/out operand, is returned.  After the loop it holds the value
  * that the successful cmpxchg8b compared against, i.e. the previous
  * contents of *p.
  */
 static __inline uint64_t
 atomic_swap_64_i586(volatile uint64_t *p, uint64_t v)
 {
 
 	__asm __volatile(
 	"	movl	%%eax,%%ebx ;	"
 	"	movl	%%edx,%%ecx ;	"
 	"1:				"
 	"	" MPLOCKED "		"
 	"	cmpxchg8b %0 ;		"
 	"	jne	1b"
 	: "+m" (*p),			/* 0 */
 	  "+A" (v)			/* 1 */
 	: : "ebx", "ecx", "memory", "cc");
 	return (v);
 }
 
 /*
  * 64-bit compare-and-set.  Dispatches on the CPUID_CX8 bit of
  * cpu_feature: the _i586 routine when it is set, the _i386 routine
  * otherwise.
  */
 static __inline int
 atomic_cmpset_64(volatile uint64_t *dst, uint64_t expect, uint64_t src)
 {
 
 	if ((cpu_feature & CPUID_CX8) != 0)
 		return (atomic_cmpset_64_i586(dst, expect, src));
 	return (atomic_cmpset_64_i386(dst, expect, src));
 }
 
 /*
  * 64-bit load.  Dispatches on the CPUID_CX8 bit of cpu_feature: the
  * _i586 routine when it is set, the _i386 routine otherwise.
  */
 static __inline uint64_t
 atomic_load_acq_64(volatile uint64_t *p)
 {
 
 	return ((cpu_feature & CPUID_CX8) != 0 ?
 	    atomic_load_acq_64_i586(p) : atomic_load_acq_64_i386(p));
 }
 
 /*
  * 64-bit store.  Dispatches on the CPUID_CX8 bit of cpu_feature: the
  * _i586 routine when it is set, the _i386 routine otherwise.
  */
 static __inline void
 atomic_store_rel_64(volatile uint64_t *p, uint64_t v)
 {
 
 	if ((cpu_feature & CPUID_CX8) != 0) {
 		atomic_store_rel_64_i586(p, v);
 		return;
 	}
 	atomic_store_rel_64_i386(p, v);
 }
 
 /*
  * 64-bit swap.  Dispatches on the CPUID_CX8 bit of cpu_feature: the
  * _i586 routine when it is set, the _i386 routine otherwise.
  */
 static __inline uint64_t
 atomic_swap_64(volatile uint64_t *p, uint64_t v)
 {
 
 	if ((cpu_feature & CPUID_CX8) != 0)
 		return (atomic_swap_64_i586(p, v));
 	return (atomic_swap_64_i386(p, v));
+}
+
+static __inline uint64_t
+atomic_fetchadd_64(volatile uint64_t *p, uint64_t v)
+{
+
+	for (;;) {	/* retry until the compare-and-set succeeds */
+		uint64_t t = *p;	/* plain read; a mismatching snapshot just fails the cmpset */
+		if (atomic_cmpset_64(p, t, t + v))
+			return (t);	/* *p went from t to t + v: return the old value */
+	}
 }
 
 #endif /* _KERNEL */
 
 #endif /* KLD_MODULE || !__GNUCLIKE_ASM */
 
 /*
  * Instantiate set/clear/add/subtract for char, short, int and long.
  * "clear" passes ~v so that the and-instruction clears exactly the bits
  * that are set in v.  The long variants use the same 32-bit
  * instructions as the int variants.
  */
 ATOMIC_ASM(set,	     char,  "orb %b1,%0",  "iq",  v);
 ATOMIC_ASM(clear,    char,  "andb %b1,%0", "iq", ~v);
 ATOMIC_ASM(add,	     char,  "addb %b1,%0", "iq",  v);
 ATOMIC_ASM(subtract, char,  "subb %b1,%0", "iq",  v);
 
 ATOMIC_ASM(set,	     short, "orw %w1,%0",  "ir",  v);
 ATOMIC_ASM(clear,    short, "andw %w1,%0", "ir", ~v);
 ATOMIC_ASM(add,	     short, "addw %w1,%0", "ir",  v);
 ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir",  v);
 
 ATOMIC_ASM(set,	     int,   "orl %1,%0",   "ir",  v);
 ATOMIC_ASM(clear,    int,   "andl %1,%0",  "ir", ~v);
 ATOMIC_ASM(add,	     int,   "addl %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, int,   "subl %1,%0",  "ir",  v);
 
 ATOMIC_ASM(set,	     long,  "orl %1,%0",   "ir",  v);
 ATOMIC_ASM(clear,    long,  "andl %1,%0",  "ir", ~v);
 ATOMIC_ASM(add,	     long,  "addl %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subl %1,%0",  "ir",  v);
 
 /*
  * ATOMIC_LOADSTORE(TYPE) expands to ATOMIC_LOAD(TYPE) followed by
  * ATOMIC_STORE(TYPE); it is instantiated for char, short, int and long.
  */
 #define	ATOMIC_LOADSTORE(TYPE)				\
 	ATOMIC_LOAD(TYPE);				\
 	ATOMIC_STORE(TYPE)
 
 ATOMIC_LOADSTORE(char);
 ATOMIC_LOADSTORE(short);
 ATOMIC_LOADSTORE(int);
 ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
 #undef ATOMIC_LOADSTORE
 
 #ifndef WANT_FUNCTIONS
 
 /*
  * Compare-and-set on a u_long, implemented by forwarding to the u_int
  * version with the pointer and both values cast to u_int.
  * Returns 0 on failure, non-zero on success.
  */
 static __inline int
 atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src)
 {
 
 	return (atomic_cmpset_int((volatile u_int *)dst, (u_int)expect,
 	    (u_int)src));
 }
 
 /*
  * fcmpset counterpart of atomic_cmpset_long(): forwards to
  * atomic_fcmpset_int(), so on failure the value found in *dst is written
  * back to *expect.  The atomic_fcmpset_acq_long and
  * atomic_fcmpset_rel_long aliases in this header expand to this name.
  * Returns 0 on failure, non-zero on success.
  */
 static __inline int
 atomic_fcmpset_long(volatile u_long *dst, u_long *expect, u_long src)
 {
 
 	return (atomic_fcmpset_int((volatile u_int *)dst, (u_int *)expect,
 	    (u_int)src));
 }
 
 /*
  * Fetch-and-add on a u_long, carried out by the u_int implementation on
  * the same address; the previous value is returned.
  */
 static __inline u_long
 atomic_fetchadd_long(volatile u_long *p, u_long v)
 {
 	u_int prev;
 
 	prev = atomic_fetchadd_int((volatile u_int *)p, (u_int)v);
 	return (prev);
 }
 
 /*
  * Test-and-set on a u_long, carried out by the u_int implementation on
  * the same address.
  */
 static __inline int
 atomic_testandset_long(volatile u_long *p, u_int v)
 {
 	volatile u_int *word = (volatile u_int *)p;
 
 	return (atomic_testandset_int(word, v));
 }
 
 /*
  * Test-and-clear on a u_long, carried out by the u_int implementation
  * on the same address.
  */
 static __inline int
 atomic_testandclear_long(volatile u_long *p, u_int v)
 {
 	volatile u_int *word = (volatile u_int *)p;
 
 	return (atomic_testandclear_int(word, v));
 }
 
 /* Read the current value and store a new value in the destination. */
 #ifdef __GNUCLIKE_ASM
 
 /*
  * Exchange v with *p using xchgl and return what *p held before.  No
  * MPLOCKED prefix is emitted here (on x86, xchg with a memory operand
  * is locked implicitly).
  */
 static __inline u_int
 atomic_swap_int(volatile u_int *p, u_int v)
 {
 
 	__asm __volatile(
 	"	xchgl	%1,%0 ;		"
 	"# atomic_swap_int"
 	: "+r" (v),			/* 0 */
 	  "+m" (*p));			/* 1 */
 	return (v);
 }
 
 /*
  * Swap on a u_long, carried out by the u_int implementation on the same
  * address; the previous value is returned.
  */
 static __inline u_long
 atomic_swap_long(volatile u_long *p, u_long v)
 {
 	u_int prev;
 
 	prev = atomic_swap_int((volatile u_int *)p, (u_int)v);
 	return (prev);
 }
 
 #else /* !__GNUCLIKE_ASM */
 
 u_int	atomic_swap_int(volatile u_int *p, u_int v);
 u_long	atomic_swap_long(volatile u_long *p, u_long v);
 
 #endif /* __GNUCLIKE_ASM */
 
 /*
  * Acquire/release spellings for char, short, int and long.  For set,
  * clear, add and subtract both spellings map to the "_barr_" variant
  * generated by ATOMIC_ASM(); for cmpset and fcmpset they map to the
  * plain function of the same width.
  */
 #define	atomic_set_acq_char		atomic_set_barr_char
 #define	atomic_set_rel_char		atomic_set_barr_char
 #define	atomic_clear_acq_char		atomic_clear_barr_char
 #define	atomic_clear_rel_char		atomic_clear_barr_char
 #define	atomic_add_acq_char		atomic_add_barr_char
 #define	atomic_add_rel_char		atomic_add_barr_char
 #define	atomic_subtract_acq_char	atomic_subtract_barr_char
 #define	atomic_subtract_rel_char	atomic_subtract_barr_char
 #define	atomic_cmpset_acq_char		atomic_cmpset_char
 #define	atomic_cmpset_rel_char		atomic_cmpset_char
 #define	atomic_fcmpset_acq_char		atomic_fcmpset_char
 #define	atomic_fcmpset_rel_char		atomic_fcmpset_char
 
 #define	atomic_set_acq_short		atomic_set_barr_short
 #define	atomic_set_rel_short		atomic_set_barr_short
 #define	atomic_clear_acq_short		atomic_clear_barr_short
 #define	atomic_clear_rel_short		atomic_clear_barr_short
 #define	atomic_add_acq_short		atomic_add_barr_short
 #define	atomic_add_rel_short		atomic_add_barr_short
 #define	atomic_subtract_acq_short	atomic_subtract_barr_short
 #define	atomic_subtract_rel_short	atomic_subtract_barr_short
 #define	atomic_cmpset_acq_short		atomic_cmpset_short
 #define	atomic_cmpset_rel_short		atomic_cmpset_short
 #define	atomic_fcmpset_acq_short	atomic_fcmpset_short
 #define	atomic_fcmpset_rel_short	atomic_fcmpset_short
 
 #define	atomic_set_acq_int		atomic_set_barr_int
 #define	atomic_set_rel_int		atomic_set_barr_int
 #define	atomic_clear_acq_int		atomic_clear_barr_int
 #define	atomic_clear_rel_int		atomic_clear_barr_int
 #define	atomic_add_acq_int		atomic_add_barr_int
 #define	atomic_add_rel_int		atomic_add_barr_int
 #define	atomic_subtract_acq_int		atomic_subtract_barr_int
 #define	atomic_subtract_rel_int		atomic_subtract_barr_int
 #define	atomic_cmpset_acq_int		atomic_cmpset_int
 #define	atomic_cmpset_rel_int		atomic_cmpset_int
 #define	atomic_fcmpset_acq_int		atomic_fcmpset_int
 #define	atomic_fcmpset_rel_int		atomic_fcmpset_int
 
 #define	atomic_set_acq_long		atomic_set_barr_long
 #define	atomic_set_rel_long		atomic_set_barr_long
 #define	atomic_clear_acq_long		atomic_clear_barr_long
 #define	atomic_clear_rel_long		atomic_clear_barr_long
 #define	atomic_add_acq_long		atomic_add_barr_long
 #define	atomic_add_rel_long		atomic_add_barr_long
 #define	atomic_subtract_acq_long	atomic_subtract_barr_long
 #define	atomic_subtract_rel_long	atomic_subtract_barr_long
 #define	atomic_cmpset_acq_long		atomic_cmpset_long
 #define	atomic_cmpset_rel_long		atomic_cmpset_long
 #define	atomic_fcmpset_acq_long		atomic_fcmpset_long
 #define	atomic_fcmpset_rel_long		atomic_fcmpset_long
 
 /* Read-and-clear is implemented as a swap with the value 0. */
 #define	atomic_readandclear_int(p)	atomic_swap_int(p, 0)
 #define	atomic_readandclear_long(p)	atomic_swap_long(p, 0)
 
 /*
  * Operations on 8-bit bytes: each "_8" name is an alias for the
  * corresponding "_char" operation.
  */
 #define	atomic_set_8		atomic_set_char
 #define	atomic_set_acq_8	atomic_set_acq_char
 #define	atomic_set_rel_8	atomic_set_rel_char
 #define	atomic_clear_8		atomic_clear_char
 #define	atomic_clear_acq_8	atomic_clear_acq_char
 #define	atomic_clear_rel_8	atomic_clear_rel_char
 #define	atomic_add_8		atomic_add_char
 #define	atomic_add_acq_8	atomic_add_acq_char
 #define	atomic_add_rel_8	atomic_add_rel_char
 #define	atomic_subtract_8	atomic_subtract_char
 #define	atomic_subtract_acq_8	atomic_subtract_acq_char
 #define	atomic_subtract_rel_8	atomic_subtract_rel_char
 #define	atomic_load_acq_8	atomic_load_acq_char
 #define	atomic_store_rel_8	atomic_store_rel_char
 #define	atomic_cmpset_8		atomic_cmpset_char
 #define	atomic_cmpset_acq_8	atomic_cmpset_acq_char
 #define	atomic_cmpset_rel_8	atomic_cmpset_rel_char
 #define	atomic_fcmpset_8	atomic_fcmpset_char
 #define	atomic_fcmpset_acq_8	atomic_fcmpset_acq_char
 #define	atomic_fcmpset_rel_8	atomic_fcmpset_rel_char
 
 /*
  * Operations on 16-bit words: each "_16" name is an alias for the
  * corresponding "_short" operation.
  */
 #define	atomic_set_16		atomic_set_short
 #define	atomic_set_acq_16	atomic_set_acq_short
 #define	atomic_set_rel_16	atomic_set_rel_short
 #define	atomic_clear_16		atomic_clear_short
 #define	atomic_clear_acq_16	atomic_clear_acq_short
 #define	atomic_clear_rel_16	atomic_clear_rel_short
 #define	atomic_add_16		atomic_add_short
 #define	atomic_add_acq_16	atomic_add_acq_short
 #define	atomic_add_rel_16	atomic_add_rel_short
 #define	atomic_subtract_16	atomic_subtract_short
 #define	atomic_subtract_acq_16	atomic_subtract_acq_short
 #define	atomic_subtract_rel_16	atomic_subtract_rel_short
 #define	atomic_load_acq_16	atomic_load_acq_short
 #define	atomic_store_rel_16	atomic_store_rel_short
 #define	atomic_cmpset_16	atomic_cmpset_short
 #define	atomic_cmpset_acq_16	atomic_cmpset_acq_short
 #define	atomic_cmpset_rel_16	atomic_cmpset_rel_short
 #define	atomic_fcmpset_16	atomic_fcmpset_short
 #define	atomic_fcmpset_acq_16	atomic_fcmpset_acq_short
 #define	atomic_fcmpset_rel_16	atomic_fcmpset_rel_short
 
 /*
  * Operations on 32-bit double words: each "_32" name is an alias for
  * the corresponding "_int" operation.  Unlike the 8- and 16-bit sets,
  * this one also covers swap, readandclear, fetchadd, testandset and
  * testandclear.
  */
 #define	atomic_set_32		atomic_set_int
 #define	atomic_set_acq_32	atomic_set_acq_int
 #define	atomic_set_rel_32	atomic_set_rel_int
 #define	atomic_clear_32		atomic_clear_int
 #define	atomic_clear_acq_32	atomic_clear_acq_int
 #define	atomic_clear_rel_32	atomic_clear_rel_int
 #define	atomic_add_32		atomic_add_int
 #define	atomic_add_acq_32	atomic_add_acq_int
 #define	atomic_add_rel_32	atomic_add_rel_int
 #define	atomic_subtract_32	atomic_subtract_int
 #define	atomic_subtract_acq_32	atomic_subtract_acq_int
 #define	atomic_subtract_rel_32	atomic_subtract_rel_int
 #define	atomic_load_acq_32	atomic_load_acq_int
 #define	atomic_store_rel_32	atomic_store_rel_int
 #define	atomic_cmpset_32	atomic_cmpset_int
 #define	atomic_cmpset_acq_32	atomic_cmpset_acq_int
 #define	atomic_cmpset_rel_32	atomic_cmpset_rel_int
 #define	atomic_fcmpset_32	atomic_fcmpset_int
 #define	atomic_fcmpset_acq_32	atomic_fcmpset_acq_int
 #define	atomic_fcmpset_rel_32	atomic_fcmpset_rel_int
 #define	atomic_swap_32		atomic_swap_int
 #define	atomic_readandclear_32	atomic_readandclear_int
 #define	atomic_fetchadd_32	atomic_fetchadd_int
 #define	atomic_testandset_32	atomic_testandset_int
 #define	atomic_testandclear_32	atomic_testandclear_int
 
 /*
  * Operations on pointers.  Every macro forwards to the "_int" operation
  * after casting the target to (volatile u_int *); value arguments are
  * cast to u_int, and the "expected" argument of the fcmpset forms to
  * (u_int *).
  *
  * NOTE(review): atomic_store_rel_ptr() is the only macro here that
  * passes v through without a (u_int) cast -- confirm this is intended.
  */
 #define	atomic_set_ptr(p, v) \
 	atomic_set_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_set_acq_ptr(p, v) \
 	atomic_set_acq_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_set_rel_ptr(p, v) \
 	atomic_set_rel_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_clear_ptr(p, v) \
 	atomic_clear_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_clear_acq_ptr(p, v) \
 	atomic_clear_acq_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_clear_rel_ptr(p, v) \
 	atomic_clear_rel_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_add_ptr(p, v) \
 	atomic_add_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_add_acq_ptr(p, v) \
 	atomic_add_acq_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_add_rel_ptr(p, v) \
 	atomic_add_rel_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_subtract_ptr(p, v) \
 	atomic_subtract_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_subtract_acq_ptr(p, v) \
 	atomic_subtract_acq_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_subtract_rel_ptr(p, v) \
 	atomic_subtract_rel_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_load_acq_ptr(p) \
 	atomic_load_acq_int((volatile u_int *)(p))
 #define	atomic_store_rel_ptr(p, v) \
 	atomic_store_rel_int((volatile u_int *)(p), (v))
 #define	atomic_cmpset_ptr(dst, old, new) \
 	atomic_cmpset_int((volatile u_int *)(dst), (u_int)(old), (u_int)(new))
 #define	atomic_cmpset_acq_ptr(dst, old, new) \
 	atomic_cmpset_acq_int((volatile u_int *)(dst), (u_int)(old), \
 	    (u_int)(new))
 #define	atomic_cmpset_rel_ptr(dst, old, new) \
 	atomic_cmpset_rel_int((volatile u_int *)(dst), (u_int)(old), \
 	    (u_int)(new))
 #define	atomic_fcmpset_ptr(dst, old, new) \
 	atomic_fcmpset_int((volatile u_int *)(dst), (u_int *)(old), (u_int)(new))
 #define	atomic_fcmpset_acq_ptr(dst, old, new) \
 	atomic_fcmpset_acq_int((volatile u_int *)(dst), (u_int *)(old), \
 	    (u_int)(new))
 #define	atomic_fcmpset_rel_ptr(dst, old, new) \
 	atomic_fcmpset_rel_int((volatile u_int *)(dst), (u_int *)(old), \
 	    (u_int)(new))
 #define	atomic_swap_ptr(p, v) \
 	atomic_swap_int((volatile u_int *)(p), (u_int)(v))
 #define	atomic_readandclear_ptr(p) \
 	atomic_readandclear_int((volatile u_int *)(p))
 
 #endif /* !WANT_FUNCTIONS */
 
 /*
  * mb(), wmb() and rmb() all expand to the same locked-add barrier:
  * __mbk() in the kernel and __mbu() everywhere else.
  */
 #if defined(_KERNEL)
 #define	mb()	__mbk()
 #define	wmb()	__mbk()
 #define	rmb()	__mbk()
 #else
 #define	mb()	__mbu()
 #define	wmb()	__mbu()
 #define	rmb()	__mbu()
 #endif
 
 #endif /* !_MACHINE_ATOMIC_H_ */
Index: projects/bsd_rdma_4_9/sys/kern/init_main.c
===================================================================
--- projects/bsd_rdma_4_9/sys/kern/init_main.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/kern/init_main.c	(revision 326162)
@@ -1,875 +1,875 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1995 Terrence R. Lambert
  * All rights reserved.
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)init_main.c	8.9 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_init_path.h"
 #include "opt_verbose_sysinit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/exec.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/loginclass.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/sysent.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #include <sys/unistd.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/cpuset.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_domain.h>
 #include <sys/copyright.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 
 void mi_startup(void);				/* Should be elsewhere */
 
 /* Components of the first process -- never freed. */
 static struct session session0;
 static struct pgrp pgrp0;
 struct	proc proc0;
 struct thread0_storage thread0_st __aligned(32);
 struct	vmspace vmspace0;
 struct	proc *initproc;
 
 #ifndef BOOTHOWTO
 #define	BOOTHOWTO	0
 #endif
 int	boothowto = BOOTHOWTO;	/* initialized so that it can be patched */
 SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
 	"Boot control flags, passed from loader");
 
 #ifndef BOOTVERBOSE
 #define	BOOTVERBOSE	0
 #endif
 int	bootverbose = BOOTVERBOSE;
 SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
 	"Control the output of verbose kernel messages");
 
 #ifdef INVARIANTS
 FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance");
 #endif
 
 /*
  * This ensures that there is at least one entry so that the sysinit_set
  * symbol is not undefined.  A sybsystem ID of SI_SUB_DUMMY is never
  * executed.
  */
 SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
 
 /*
  * The sysinit table itself.  Items are checked off as the are run.
  * If we want to register new sysinit types, add them to newsysinit.
  */
 SET_DECLARE(sysinit_set, struct sysinit);
 struct sysinit **sysinit, **sysinit_end;
 struct sysinit **newsysinit, **newsysinit_end;
 
 EVENTHANDLER_LIST_DECLARE(process_init);
 EVENTHANDLER_LIST_DECLARE(thread_init);
 EVENTHANDLER_LIST_DECLARE(process_ctor);
 EVENTHANDLER_LIST_DECLARE(thread_ctor);
 
 /*
  * Merge a new sysinit set into the current set, reallocating it if
  * necessary.  This can only be called after malloc is running.
  */
 void
 sysinit_add(struct sysinit **set, struct sysinit **set_end)
 {
 	struct sysinit **newset;
 	struct sysinit **sipp;
 	struct sysinit **xipp;
 	int count;
 
 	count = set_end - set;
 	if (newsysinit)
 		count += newsysinit_end - newsysinit;
 	else
 		count += sysinit_end - sysinit;
 	newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
 	if (newset == NULL)
 		panic("cannot malloc for sysinit");
 	xipp = newset;
 	if (newsysinit)
 		for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
 			*xipp++ = *sipp;
 	else
 		for (sipp = sysinit; sipp < sysinit_end; sipp++)
 			*xipp++ = *sipp;
 	for (sipp = set; sipp < set_end; sipp++)
 		*xipp++ = *sipp;
 	if (newsysinit)
 		free(newsysinit, M_TEMP);
 	newsysinit = newset;
 	newsysinit_end = newset + count;
 }
 
 #if defined (DDB) && defined(VERBOSE_SYSINIT)
 static const char *
 symbol_name(vm_offset_t va, db_strategy_t strategy)
 {
 	const char *name;
 	c_db_sym_t sym;
 	db_expr_t  offset;
 
 	if (va == 0)
 		return (NULL);
 	sym = db_search_symbol(va, strategy, &offset);
 	if (offset != 0)
 		return (NULL);
 	db_symbol_values(sym, &name, NULL);
 	return (name);
 }
 #endif
 
 /*
  * System startup; initialize the world, create process 0, mount root
  * filesystem, and fork to create init and pagedaemon.  Most of the
  * hard work is done in the lower-level initialization routines including
  * startup(), which does memory initialization and autoconfiguration.
  *
  * This allows simple addition of new kernel subsystems that require
  * boot time initialization.  It also allows substitution of subsystem
  * (for instance, a scheduler, kernel profiler, or VM system) by object
  * module.  Finally, it allows for optional "kernel threads".
  */
 void
 mi_startup(void)
 {
 
 	struct sysinit **sipp;	/* system initialization*/
 	struct sysinit **xipp;	/* interior loop of sort*/
 	struct sysinit *save;	/* bubble*/
 
 #if defined(VERBOSE_SYSINIT)
 	int last;
 	int verbose;
 #endif
 
 	if (boothowto & RB_VERBOSE)
 		bootverbose++;
 
 	if (sysinit == NULL) {
 		sysinit = SET_BEGIN(sysinit_set);
 		sysinit_end = SET_LIMIT(sysinit_set);
 	}
 
 restart:
 	/*
 	 * Perform a bubble sort of the system initialization objects by
 	 * their subsystem (primary key) and order (secondary key).
 	 */
 	for (sipp = sysinit; sipp < sysinit_end; sipp++) {
 		for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
 			if ((*sipp)->subsystem < (*xipp)->subsystem ||
 			     ((*sipp)->subsystem == (*xipp)->subsystem &&
 			      (*sipp)->order <= (*xipp)->order))
 				continue;	/* skip*/
 			save = *sipp;
 			*sipp = *xipp;
 			*xipp = save;
 		}
 	}
 
 #if defined(VERBOSE_SYSINIT)
 	last = SI_SUB_COPYRIGHT;
 	verbose = 0;
 #if !defined(DDB)
 	printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
 #endif
 #endif
 
 	/*
 	 * Traverse the (now) ordered list of system initialization tasks.
 	 * Perform each task, and continue on to the next task.
 	 */
 	for (sipp = sysinit; sipp < sysinit_end; sipp++) {
 
 		if ((*sipp)->subsystem == SI_SUB_DUMMY)
 			continue;	/* skip dummy task(s)*/
 
 		if ((*sipp)->subsystem == SI_SUB_DONE)
 			continue;
 
 #if defined(VERBOSE_SYSINIT)
 		if ((*sipp)->subsystem > last) {
 			verbose = 1;
 			last = (*sipp)->subsystem;
 			printf("subsystem %x\n", last);
 		}
 		if (verbose) {
 #if defined(DDB)
 			const char *func, *data;
 
 			func = symbol_name((vm_offset_t)(*sipp)->func,
 			    DB_STGY_PROC);
 			data = symbol_name((vm_offset_t)(*sipp)->udata,
 			    DB_STGY_ANY);
 			if (func != NULL && data != NULL)
 				printf("   %s(&%s)... ", func, data);
 			else if (func != NULL)
 				printf("   %s(%p)... ", func, (*sipp)->udata);
 			else
 #endif
 				printf("   %p(%p)... ", (*sipp)->func,
 				    (*sipp)->udata);
 		}
 #endif
 
 		/* Call function */
 		(*((*sipp)->func))((*sipp)->udata);
 
 #if defined(VERBOSE_SYSINIT)
 		if (verbose)
 			printf("done.\n");
 #endif
 
 		/* Check off the one we're just done */
 		(*sipp)->subsystem = SI_SUB_DONE;
 
 		/* Check if we've installed more sysinit items via KLD */
 		if (newsysinit != NULL) {
 			if (sysinit != SET_BEGIN(sysinit_set))
 				free(sysinit, M_TEMP);
 			sysinit = newsysinit;
 			sysinit_end = newsysinit_end;
 			newsysinit = NULL;
 			newsysinit_end = NULL;
 			goto restart;
 		}
 	}
 
 	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
 	mtx_unlock(&Giant);
 
 	/*
 	 * Now hand over this thread to swapper.
 	 */
 	swapper();
 	/* NOTREACHED*/
 }
 
 static void
 print_caddr_t(void *data)
 {
 	printf("%s", (char *)data);
 }
 
 static void
 print_version(void *data __unused)
 {
 	int len;
 
 	/* Strip a trailing newline from version. */
 	len = strlen(version);
 	while (len > 0 && version[len - 1] == '\n')
 		len--;
 	printf("%.*s %s\n", len, version, machine);
 	printf("%s\n", compiler_version);
 }
 
 SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
     copyright);
 SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
     trademark);
 SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
 
 #ifdef WITNESS
 static char wit_warn[] =
      "WARNING: WITNESS option enabled, expect reduced performance.\n";
 SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
    print_caddr_t, wit_warn);
 SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1,
    print_caddr_t, wit_warn);
 #endif
 
 #ifdef DIAGNOSTIC
 static char diag_warn[] =
      "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
 SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
     print_caddr_t, diag_warn);
 SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2,
     print_caddr_t, diag_warn);
 #endif
 
 static int
 null_fetch_syscall_args(struct thread *td __unused)
 {
 
 	panic("null_fetch_syscall_args");
 }
 
 static void
 null_set_syscall_retval(struct thread *td __unused, int error __unused)
 {
 
 	panic("null_set_syscall_retval");
 }
 
 struct sysentvec null_sysvec = {
 	.sv_size	= 0,
 	.sv_table	= NULL,
 	.sv_mask	= 0,
 	.sv_errsize	= 0,
 	.sv_errtbl	= NULL,
 	.sv_transtrap	= NULL,
 	.sv_fixup	= NULL,
 	.sv_sendsig	= NULL,
 	.sv_sigcode	= NULL,
 	.sv_szsigcode	= NULL,
 	.sv_name	= "null",
 	.sv_coredump	= NULL,
 	.sv_imgact_try	= NULL,
 	.sv_minsigstksz	= 0,
 	.sv_pagesize	= PAGE_SIZE,
 	.sv_minuser	= VM_MIN_ADDRESS,
 	.sv_maxuser	= VM_MAXUSER_ADDRESS,
 	.sv_usrstack	= USRSTACK,
 	.sv_psstrings	= PS_STRINGS,
 	.sv_stackprot	= VM_PROT_ALL,
 	.sv_copyout_strings	= NULL,
 	.sv_setregs	= NULL,
 	.sv_fixlimit	= NULL,
 	.sv_maxssiz	= NULL,
 	.sv_flags	= 0,
 	.sv_set_syscall_retval = null_set_syscall_retval,
 	.sv_fetch_syscall_args = null_fetch_syscall_args,
 	.sv_syscallnames = NULL,
 	.sv_schedtail	= NULL,
 	.sv_thread_detach = NULL,
 	.sv_trap	= NULL,
 };
 
 /*
  * The two following SYSINIT's are proc0 specific glue code.  I am not
  * convinced that they can not be safely combined, but their order of
  * operation has been maintained as the same as the original init_main.c
  * for right now.
  */
 /* ARGSUSED*/
 static void
 proc0_init(void *dummy __unused)
 {
 	struct proc *p;
 	struct thread *td;
 	struct ucred *newcred;
 	struct uidinfo tmpuinfo;
 	struct loginclass tmplc = {
 		.lc_name = "",
 	};
 	vm_paddr_t pageablemem;
 	int i;
 
 	GIANT_REQUIRED;
 	p = &proc0;
 	td = &thread0;
 	
 	/*
 	 * Initialize magic number and osrel.
 	 */
 	p->p_magic = P_MAGIC;
 	p->p_osrel = osreldate;
 
 	/*
 	 * Initialize thread and process structures.
 	 */
 	procinit();	/* set up proc zone */
 	threadinit();	/* set up UMA zones */
 
 	/*
 	 * Initialise scheduler resources.
 	 * Add scheduler specific parts to proc, thread as needed.
 	 */
 	schedinit();	/* scheduler gets its house in order */
 
 	/*
 	 * Create process 0 (the swapper).
 	 */
 	LIST_INSERT_HEAD(&allproc, p, p_list);
 	LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
 	mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 	p->p_pgrp = &pgrp0;
 	LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
 	LIST_INIT(&pgrp0.pg_members);
 	LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
 
 	pgrp0.pg_session = &session0;
 	mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
 	refcount_init(&session0.s_count, 1);
 	session0.s_leader = p;
 
 	p->p_sysent = &null_sysvec;
 	p->p_flag = P_SYSTEM | P_INMEM | P_KPROC;
 	p->p_flag2 = 0;
 	p->p_state = PRS_NORMAL;
 	p->p_klist = knlist_alloc(&p->p_mtx);
 	STAILQ_INIT(&p->p_ktr);
 	p->p_nice = NZERO;
 	/* pid_max cannot be greater than PID_MAX */
 	td->td_tid = PID_MAX + 1;
 	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
 	td->td_state = TDS_RUNNING;
 	td->td_pri_class = PRI_TIMESHARE;
 	td->td_user_pri = PUSER;
 	td->td_base_user_pri = PUSER;
 	td->td_lend_user_pri = PRI_MAX;
 	td->td_priority = PVM;
 	td->td_base_pri = PVM;
 	td->td_oncpu = curcpu;
 	td->td_flags = TDF_INMEM;
 	td->td_pflags = TDP_KTHREAD;
 	td->td_cpuset = cpuset_thread0();
 	vm_domain_policy_init(&td->td_vm_dom_policy);
 	vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1);
 	vm_domain_policy_init(&p->p_vm_dom_policy);
 	vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1);
 	prison0_init();
 	p->p_peers = 0;
 	p->p_leader = p;
 	p->p_reaper = p;
 	LIST_INIT(&p->p_reaplist);
 
 	strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
 	strncpy(td->td_name, "swapper", sizeof (td->td_name));
 
 	callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
 	callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
 	callout_init(&td->td_slpcallout, 1);
 
 	/* Create credentials. */
 	newcred = crget();
 	newcred->cr_ngroups = 1;	/* group 0 */
 	/* A hack to prevent uifind from tripping over NULL pointers. */
 	curthread->td_ucred = newcred;
 	tmpuinfo.ui_uid = 1;
 	newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo;
 	newcred->cr_uidinfo = uifind(0);
 	newcred->cr_ruidinfo = uifind(0);
 	newcred->cr_loginclass = &tmplc;
 	newcred->cr_loginclass = loginclass_find("default");
 	/* End hack. creds get properly set later with thread_cow_get_proc */
 	curthread->td_ucred = NULL;
 	newcred->cr_prison = &prison0;
 	proc_set_cred_init(p, newcred);
 #ifdef AUDIT
 	audit_cred_kproc0(newcred);
 #endif
 #ifdef MAC
 	mac_cred_create_swapper(newcred);
 #endif
 	/* Create sigacts. */
 	p->p_sigacts = sigacts_alloc();
 
 	/* Initialize signal state for process 0. */
 	siginit(&proc0);
 
 	/* Create the file descriptor table. */
 	p->p_fd = fdinit(NULL, false);
 	p->p_fdtol = NULL;
 
 	/* Create the limits structures. */
 	p->p_limit = lim_alloc();
 	for (i = 0; i < RLIM_NLIMITS; i++)
 		p->p_limit->pl_rlimit[i].rlim_cur =
 		    p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
 	p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
 	p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
 	p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
 	p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
 	p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
 	p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
 	/* Cast to avoid overflow on i386/PAE. */
 	pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count);
 	p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
 	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
 	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
 	p->p_cpulimit = RLIM_INFINITY;
 
 	PROC_LOCK(p);
 	thread_cow_get_proc(td, p);
 	PROC_UNLOCK(p);
 
 	/* Initialize resource accounting structures. */
 	racct_create(&p->p_racct);
 
 	p->p_stats = pstats_alloc();
 
 	/* Allocate a prototype map so we have something to fork. */
 	p->p_vmspace = &vmspace0;
 	vmspace0.vm_refcnt = 1;
 	pmap_pinit0(vmspace_pmap(&vmspace0));
 
 	/*
 	 * proc0 is not expected to enter usermode, so there is no special
 	 * handling for sv_minuser here, like is done for exec_new_vmspace().
 	 */
 	vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
 	    p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
 
 	/*
 	 * Call the init and ctor for the new thread and proc.  We wait
 	 * to do this until all other structures are fairly sane.
 	 */
 	EVENTHANDLER_DIRECT_INVOKE(process_init, p);
 	EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
 	EVENTHANDLER_DIRECT_INVOKE(process_ctor, p);
 	EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);
 
 	/*
 	 * Charge root for one process.
 	 */
 	(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
 	PROC_LOCK(p);
 	racct_add_force(p, RACCT_NPROC, 1);
 	PROC_UNLOCK(p);
 }
 SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
 
 /* ARGSUSED*/
 static void
 proc0_post(void *dummy __unused)
 {
 	struct timespec ts;
 	struct proc *p;
 	struct rusage ru;
 	struct thread *td;
 
 	/*
 	 * Now we can look at the time, having had a chance to verify the
 	 * time from the filesystem.  Pretend that proc0 started now.
 	 */
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		microuptime(&p->p_stats->p_start);
 		PROC_STATLOCK(p);
 		rufetch(p, &ru);	/* Clears thread stats */
 		PROC_STATUNLOCK(p);
 		p->p_rux.rux_runtime = 0;
 		p->p_rux.rux_uticks = 0;
 		p->p_rux.rux_sticks = 0;
 		p->p_rux.rux_iticks = 0;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			td->td_runtime = 0;
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	PCPU_SET(switchtime, cpu_ticks());
 	PCPU_SET(switchticks, ticks);
 
 	/*
 	 * Give the ``random'' number generator a thump.
 	 */
 	nanotime(&ts);
 	srandom(ts.tv_sec ^ ts.tv_nsec);
 }
 SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
 
 static void
 random_init(void *dummy __unused)
 {
 
 	/*
 	 * After CPU has been started we have some randomness on most
 	 * platforms via get_cyclecount().  For platforms that don't
 	 * we will reseed random(9) in proc0_post() as well.
 	 */
 	srandom(get_cyclecount());
 }
 SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL);
 
 /*
  ***************************************************************************
  ****
  **** The following SYSINIT's and glue code should be moved to the
  **** respective files on a per subsystem basis.
  ****
  ***************************************************************************
  */
 
 /*
  * List of paths to try when searching for "init".
  */
 static char init_path[MAXPATHLEN] =
 #ifdef	INIT_PATH
     __XSTRING(INIT_PATH);
 #else
     "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init";
 #endif
 SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
 	"Path used to search the init process");
 
 /*
  * Shutdown timeout of init(8).
  * Unused within kernel, but used to control init(8), hence do not remove.
  */
 #ifndef INIT_SHUTDOWN_TIMEOUT
 #define INIT_SHUTDOWN_TIMEOUT 120
 #endif
 static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
 SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
 	CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
 	"Unused within kernel, but used to control init(8)");
 
 /*
  * Start the initial user process; try exec'ing each pathname in init_path.
  * The program is invoked with one argument containing the boot flags.
  */
 static void
 start_init(void *dummy)
 {
 	vm_offset_t addr;
 	struct execve_args args;
 	int options, error;
 	char *var, *path, *next, *s;
 	char *ucp, **uap, *arg0, *arg1;
 	struct thread *td;
 	struct proc *p;
 
 	mtx_lock(&Giant);
 
 	GIANT_REQUIRED;
 
 	td = curthread;
 	p = td->td_proc;
 
 	vfs_mountroot();
 
 	/* Wipe GELI passphrase from the environment. */
 	kern_unsetenv("kern.geom.eli.passphrase");
 
 	/*
 	 * Need just enough stack to hold the faked-up "execve()" arguments.
 	 */
 	addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
 	if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0,
 	    VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
 		panic("init: couldn't allocate argument space");
 	p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
 	p->p_vmspace->vm_ssize = 1;
 
 	if ((var = kern_getenv("init_path")) != NULL) {
 		strlcpy(init_path, var, sizeof(init_path));
 		freeenv(var);
 	}
 	
 	for (path = init_path; *path != '\0'; path = next) {
 		while (*path == ':')
 			path++;
 		if (*path == '\0')
 			break;
 		for (next = path; *next != '\0' && *next != ':'; next++)
 			/* nothing */ ;
 		if (bootverbose)
 			printf("start_init: trying %.*s\n", (int)(next - path),
 			    path);
 			
 		/*
 		 * Move out the boot flag argument.
 		 */
 		options = 0;
 		ucp = (char *)p->p_sysent->sv_usrstack;
 		(void)subyte(--ucp, 0);		/* trailing zero */
 		if (boothowto & RB_SINGLE) {
 			(void)subyte(--ucp, 's');
 			options = 1;
 		}
 #ifdef notyet
                 if (boothowto & RB_FASTBOOT) {
 			(void)subyte(--ucp, 'f');
 			options = 1;
 		}
 #endif
 
 #ifdef BOOTCDROM
 		(void)subyte(--ucp, 'C');
 		options = 1;
 #endif
 
 		if (options == 0)
 			(void)subyte(--ucp, '-');
 		(void)subyte(--ucp, '-');		/* leading hyphen */
 		arg1 = ucp;
 
 		/*
 		 * Move out the file name (also arg 0).
 		 */
 		(void)subyte(--ucp, 0);
 		for (s = next - 1; s >= path; s--)
 			(void)subyte(--ucp, *s);
 		arg0 = ucp;
 
 		/*
 		 * Move out the arg pointers.
 		 */
 		uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t));
 		(void)suword((caddr_t)--uap, (long)0);	/* terminator */
 		(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
 		(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
 
 		/*
 		 * Point at the arguments.
 		 */
 		args.fname = arg0;
 		args.argv = uap;
 		args.envv = NULL;
 
 		/*
 		 * Now try to exec the program.  If can't for any reason
 		 * other than it doesn't exist, complain.
 		 *
 		 * Otherwise, return via fork_trampoline() all the way
 		 * to user mode as init!
 		 */
-		if ((error = sys_execve(td, &args)) == 0) {
+		if ((error = sys_execve(td, &args)) == EJUSTRETURN) {
 			mtx_unlock(&Giant);
 			return;
 		}
 		if (error != ENOENT)
 			printf("exec %.*s: error %d\n", (int)(next - path), 
 			    path, error);
 	}
 	printf("init: not found in path %s\n", init_path);
 	panic("no init");
 }
 
 /*
  * Like kproc_create(), but runs in its own address space.
  * We do this early to reserve pid 1.
  *
  * Note special case - do not make it runnable yet.  Other work
  * in progress will change this more.
  */
 static void
 create_init(const void *udata __unused)
 {
 	struct fork_req fr;
 	struct ucred *newcred, *oldcred;
 	struct thread *td;
 	int error;
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
 	fr.fr_procp = &initproc;
 	error = fork1(&thread0, &fr);
 	if (error)
 		panic("cannot fork init: %d\n", error);
 	KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
 	/* divorce init's credentials from the kernel's */
 	newcred = crget();
 	sx_xlock(&proctree_lock);
 	PROC_LOCK(initproc);
 	initproc->p_flag |= P_SYSTEM | P_INMEM;
 	initproc->p_treeflag |= P_TREE_REAPER;
 	LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling);
 	oldcred = initproc->p_ucred;
 	crcopy(newcred, oldcred);
 #ifdef MAC
 	mac_cred_create_init(newcred);
 #endif
 #ifdef AUDIT
 	audit_cred_proc1(newcred);
 #endif
 	proc_set_cred(initproc, newcred);
 	td = FIRST_THREAD_IN_PROC(initproc);
 	crfree(td->td_ucred);
 	td->td_ucred = crhold(initproc->p_ucred);
 	PROC_UNLOCK(initproc);
 	sx_xunlock(&proctree_lock);
 	crfree(oldcred);
 	cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
 	    start_init, NULL);
 }
 SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
 
 /*
  * Make it runnable now.
  */
 static void
 kick_init(const void *udata __unused)
 {
 	struct thread *td;
 
 	td = FIRST_THREAD_IN_PROC(initproc);
 	thread_lock(td);
 	TD_SET_CAN_RUN(td);
 	sched_add(td, SRQ_BORING);
 	thread_unlock(td);
 }
 SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
Index: projects/bsd_rdma_4_9/sys/kern/kern_exec.c
===================================================================
--- projects/bsd_rdma_4_9/sys/kern/kern_exec.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/kern/kern_exec.c	(revision 326162)
@@ -1,1734 +1,1740 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/acct.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 dtrace_execexit_func_t	dtrace_fasttrap_exec;
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE1(proc, , , exec, "char *");
 SDT_PROBE_DEFINE1(proc, , , exec__failure, "int");
 SDT_PROBE_DEFINE1(proc, , , exec__success, "char *");
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 int coredump_pack_fileinfo = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN,
     &coredump_pack_fileinfo, 0,
     "Enable file path packing in 'procstat -f' coredump notes");
 
 int coredump_pack_vmmapinfo = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN,
     &coredump_pack_vmmapinfo, 0,
     "Enable file path packing in 'procstat -v' coredump notes");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 static int disallow_high_osrel;
 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
     &disallow_high_osrel, 0,
     "Disallow execution of binaries built for higher version of the world");
 
 static int map_at_zero = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0,
     "Permit processes to map an object at virtual address 0.");
 
 EVENTHANDLER_LIST_DECLARE(process_exec);
 
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 		   sizeof(p->p_sysent->sv_psstrings));
 	return error;
 }
 
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 		    sizeof(p->p_sysent->sv_usrstack));
 	return error;
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Each of the items is a pointer to a `const struct execsw', hence the
  * double pointer here.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char    *fname; 
 	char    **argv;
 	char    **envv; 
 };
 #endif
 
 int
 sys_execve(struct thread *td, struct execve_args *uap)
 {
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, NULL);
 	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
 /*
  * Argument layout consumed by sys_fexecve(); only defined here when
  * _SYS_SYSPROTO_H_ has not been defined.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fexecve_args {
 	int	fd;
 	char	**argv;
 	char	**envv;
 };
 #endif
 int
 sys_fexecve(struct thread *td, struct fexecve_args *uap)
 {
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		args.fd = uap->fd;
 		error = kern_execve(td, &args, NULL);
 	}
 	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
 /*
  * Argument layout consumed by sys___mac_execve(); only defined here when
  * _SYS_SYSPROTO_H_ has not been defined.  mac_p is handed through to
  * kern_execve() unchanged.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 /*
  * __mac_execve system call entry: execve with a caller-supplied MAC label
  * pointer.  Kernels built without MAC return ENOSYS.
  */
 int
 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
 {
 #ifndef MAC
 	return (ENOSYS);
 #else
 	struct image_args iargs;
 	struct vmspace *saved_vmspace;
 	int rv;
 
 	rv = pre_execve(td, &saved_vmspace);
 	if (rv != 0)
 		return (rv);
 
 	rv = exec_copyin_args(&iargs, uap->fname, UIO_USERSPACE, uap->argv,
 	    uap->envv);
 	if (rv == 0)
 		rv = kern_execve(td, &iargs, uap->mac_p);
 
 	post_execve(td, rv, saved_vmspace);
 	return (rv);
 #endif
 }
 
 /*
  * Prepare the calling thread for an exec.  If the process has ever had
  * more than one thread, request single-threading at the boundary; failure
  * to do so yields ERESTART.  The current vmspace pointer is always stored
  * in *oldvmspace.
  */
 int
 pre_execve(struct thread *td, struct vmspace **oldvmspace)
 {
 	struct proc *proc;
 	int rv;
 
 	KASSERT(td == curthread, ("non-current thread %p", td));
 	proc = td->td_proc;
 	rv = 0;
 	if ((proc->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(proc);
 		rv = thread_single(proc, SINGLE_BOUNDARY) != 0 ? ERESTART : 0;
 		PROC_UNLOCK(proc);
 	}
 	KASSERT(rv != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("nested execve"));
 	*oldvmspace = proc->p_vmspace;
 	return (rv);
 }
 
 /*
  * Finish an exec attempt started with pre_execve().
  *
  * td		the calling thread (must be curthread).
  * error	the result of the exec attempt.
  * oldvmspace	the vmspace pointer recorded by pre_execve().
  *
  * For a process that has had threads, either escalate single-threading to
  * SINGLE_EXIT (exec succeeded) or end the SINGLE_BOUNDARY request (exec
  * failed).  If the thread is flagged TDP_EXECVMSPC, the old vmspace is
  * freed and the flag is cleared.
  */
 void
 post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
 {
 	struct proc *p;
 
 	KASSERT(td == curthread, ("non-current thread %p", td));
 	p = td->td_proc;
 	if ((p->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
-		if (error == 0)
+		if (error == EJUSTRETURN)
 			thread_single(p, SINGLE_EXIT);
 		else
 			thread_single_end(p, SINGLE_BOUNDARY);
 		PROC_UNLOCK(p);
 	}
 	/* The old address space can only be released once it is unused. */
 	if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
 		KASSERT(p->p_vmspace != oldvmspace,
 		    ("oldvmspace still used"));
 		vmspace_free(oldvmspace);
 		td->td_pflags &= ~TDP_EXECVMSPC;
 	}
 }
 
 /*
  * Record the argument and environment strings for audit, then perform the
  * exec via do_execve().
  *
  * XXX: kern_execve() has the astonishing property of not always returning
  * to its caller.  If sufficiently bad things happen inside do_execve(), it
  * can end up calling exit1(); callers must therefore avoid doing anything
  * before the call which they might need to undo afterwards (e.g.,
  * allocating memory).
  */
 int
 kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
 {
 	int rv;
 
 	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 
 	rv = do_execve(td, args, mac_p);
 	return (rv);
 }
 
 /*
  * In-kernel implementation of execve().
  *
  * td		the calling thread.
  * args		image arguments; args->fname (looked up with UIO_SYSSPACE)
  *		names the image, or, when it is NULL, args->fd does.  The
  *		argument buffer is released with exec_free_args() on every
  *		path through this function.
  * mac_p	MAC label pointer handed to mac_execve_enter() (MAC kernels).
  *
  * Outline: mark the process P_INEXEC, look up and lock the image vnode,
  * check permissions, map the first page, compute new credentials, run the
  * image activators (looping back to `interpret' for interpreted images),
  * build the new stack, install the new process state under the process
  * lock, and finally run the common cleanup code.
  *
  * Returns an errno value on failure.  If the exec failed after the old
  * address space was destroyed, the process is terminated via exit1() and
  * this function does not return.
  */
 static int
 do_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd;
 	struct ucred *oldcred;
 	struct uidinfo *euip = NULL;
 	register_t *stack_base;
 	int error, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *oldtextvp = NULL, *newtextvp;
 	cap_rights_t rights;
 	int credential_changing;
 	int textset;
 #ifdef MAC
 	struct label *interpvplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 	static const char fexecv_proc_title[] = "(fexecv)";
 
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	bzero(imgp, sizeof(*imgp));
 	imgp->proc = p;
 	imgp->attr = &attr;
 	imgp->args = args;
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	if (args->fname != NULL) {
 		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
 		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 	}
 
 	SDT_PROBE1(proc, , , exec, args->fname);
 
 	/*
 	 * Re-entered via `goto interpret' once an activator has flagged the
 	 * image as interpreted and args->fname names the interpreter.
 	 */
 interpret:
 	if (args->fname != NULL) {
 #ifdef CAPABILITY_MODE
 		/*
 		 * While capability mode can't reach this point via direct
 		 * path arguments to execve(), we also don't allow
 		 * interpreters to be used in capability mode (for now).
 		 * Catch indirect lookups and return a permissions error.
 		 */
 		if (IN_CAPABILITY_MODE(td)) {
 			error = ECAPMODE;
 			goto exec_fail;
 		}
 #endif
 		error = namei(&nd);
 		if (error)
 			goto exec_fail;
 
 		newtextvp = nd.ni_vp;
 		imgp->vp = newtextvp;
 	} else {
 		AUDIT_ARG_FD(args->fd);
 		/*
 		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
 		error = fgetvp_exec(td, args->fd,
 		    cap_rights_init(&rights, CAP_FEXECVE), &newtextvp);
 		if (error)
 			goto exec_fail;
 		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(newtextvp);
 		imgp->vp = newtextvp;
 	}
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = VOP_IS_TEXT(imgp->vp);
 	VOP_SET_TEXT(imgp->vp);
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->proc->p_osrel = 0;
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Determine new credentials before attempting image activators
 	 * so that it can be used by process_exec handlers to determine
 	 * credential/setid changes.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * We disable setuid/setgid/etc in capability mode on the basis
 	 * that most setugid applications are not written with that
 	 * environment in mind, and will therefore almost certainly operate
 	 * incorrectly. In principle there's no reason that setugid
 	 * applications might not be useful in capability mode, so we may want
 	 * to reconsider this conservative design choice in the future.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & S_ISUID) &&
 	    oldcred->cr_uid != attr.va_uid;
 	credential_changing |= (attr.va_mode & S_ISGID) &&
 	    oldcred->cr_gid != attr.va_gid;
 #ifdef MAC
 	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
 	    interpvplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 #ifdef CAPABILITY_MODE
 	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
 #endif
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		imgp->credential_setid = true;
 		/* The vnode lock is dropped around crdup() and uifind(). */
 		VOP_UNLOCK(imgp->vp, 0);
 		imgp->newcred = crdup(oldcred);
 		if (attr.va_mode & S_ISUID) {
 			euip = uifind(attr.va_uid);
 			change_euid(imgp->newcred, euip);
 		}
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 		if (attr.va_mode & S_ISGID)
 			change_egid(imgp->newcred, attr.va_gid);
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(imgp->newcred, imgp->newcred->cr_uid);
 		change_svgid(imgp->newcred, imgp->newcred->cr_gid);
 	} else {
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			VOP_UNLOCK(imgp->vp, 0);
 			imgp->newcred = crdup(oldcred);
 			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 			change_svuid(imgp->newcred, imgp->newcred->cr_uid);
 			change_svgid(imgp->newcred, imgp->newcred->cr_gid);
 		}
 	}
 	/* The new credentials are installed into the process later. */
 
 	/*
 	 * Do the best to calculate the full path to the image file.
 	 */
 	if (args->fname != NULL && args->fname[0] == '/')
 		imgp->execpath = args->fname;
 	else {
 		VOP_UNLOCK(imgp->vp, 0);
 		if (vn_fullpath(td, imgp->vp, &imgp->execpath,
 		    &imgp->freepath) != 0)
 			imgp->execpath = args->fname;
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		/* No activator recognised the image: report ENOEXEC. */
 		if (error == -1) {
 			if (textset == 0)
 				VOP_UNSET_TEXT(imgp->vp);
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		VOP_UNSET_TEXT(imgp->vp);
 		/* free name buffer and old vnode */
 		if (args->fname != NULL)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 		mac_execve_interpreter_enter(newtextvp, &interpvplabel);
 #endif
 		if (imgp->opened) {
 			VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
 			imgp->opened = 0;
 		}
 		vput(newtextvp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		imgp->credential_setid = false;
 		if (imgp->newcred != NULL) {
 			crfree(imgp->newcred);
 			imgp->newcred = NULL;
 		}
 		imgp->execpath = NULL;
 		free(imgp->freepath, M_TEMP);
 		imgp->freepath = NULL;
 		/* set new name to that of the interpreter */
 		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		args->fname = imgp->interpreter_name;
 		goto interpret;
 	}
 
 	/*
 	 * NB: We unlock the vnode here because it is believed that none
 	 * of the sv_copyout_strings/sv_fixup operations require the vnode.
 	 */
 	VOP_UNLOCK(imgp->vp, 0);
 
 	if (disallow_high_osrel &&
 	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
 		error = ENOEXEC;
 		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
 		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
 		/* exec_fail_dealloc expects the vnode to be locked. */
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		goto exec_fail_dealloc;
 	}
 
 	/* ABI enforces the use of Capsicum. Switch into capabilities mode. */
 	if (SV_PROC_FLAG(p, SV_CAPSICUM))
 		sys_cap_enter(td, NULL);
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	if (args->fdp != NULL) {
 		/* Install a brand new file descriptor table. */
 		fdinstall_remapped(td, args->fdp);
 		args->fdp = NULL;
 	} else {
 		/*
 		 * Keep on using the existing file descriptor table. For
 		 * security and other reasons, the file descriptor table
 		 * cannot be shared after an exec.
 		 */
 		fdunshare(td);
 		/* close files on exec */
 		fdcloseexec(td);
 	}
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 	}
 
 	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 
 	PROC_LOCK(p);
 	if (oldsigacts)
 		p->p_sigacts = newsigacts;
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	bzero(p->p_comm, sizeof(p->p_comm));
 	if (args->fname)
 		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
 		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
 	else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
 		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
 	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
 		p->p_flag2 &= ~P2_NOTRACE;
 	if (p->p_flag & P_PPWAIT) {
 		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
 		cv_broadcast(&p->p_pwait);
 		/* STOPs are no longer ignored, arrange for AST */
 		signotify(td);
 	}
 
 	/*
 	 * Implement image setuid/setgid installation.
 	 */
 	if (imgp->credential_setid) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracecred != NULL &&
 		    priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0))
 			ktrprocexec(p, &tracecred, &tracevp);
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * Both fdsetugidsafety() and fdcheckstd() may call functions
 		 * taking sleepable locks, so temporarily drop our locks.
 		 */
 		PROC_UNLOCK(p);
 		VOP_UNLOCK(imgp->vp, 0);
 		fdsetugidsafety(td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		if (error != 0)
 			goto exec_fail_dealloc;
 		PROC_LOCK(p);
 #ifdef MAC
 		if (will_transition) {
 			mac_vnode_execve_transition(oldcred, imgp->newcred,
 			    imgp->vp, interpvplabel, imgp);
 		}
 #endif
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 	}
 	/*
 	 * Set the new credentials.
 	 */
 	if (imgp->newcred != NULL) {
 		proc_set_cred(p, imgp->newcred);
 		crfree(oldcred);
 		/* NULL oldcred marks newcred as installed for the cleanup below. */
 		oldcred = NULL;
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced by namei
 	 * or fgetvp_exec.
 	 */
 	oldtextvp = p->p_textvp;
 	p->p_textvp = newtextvp;
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the exec if it
 	 * has declared an interest.
 	 */
 	if (dtrace_fasttrap_exec)
 		dtrace_fasttrap_exec(p);
 #endif
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 	PROC_UNLOCK(p);
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		VOP_UNLOCK(imgp->vp, 0);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	}
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp, 
 		    (u_long)(uintptr_t)stack_base);
 	else
 		exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
 
 	vfs_mark_atime(imgp->vp, td->td_ucred);
 
 	SDT_PROBE1(proc, , , exec__success, args->fname);
 
 	/*
 	 * Common cleanup.  Reached by falling through on success (error == 0)
 	 * and by goto for failures that occur once imgp->vp has been set.
 	 */
 exec_fail_dealloc:
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		if (args->fname)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (imgp->opened)
 			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
 		/*
 		 * On failure drop the vnode lock and reference together; on
 		 * success only unlock, as the vnode is now p->p_textvp.
 		 */
 		if (error != 0)
 			vput(imgp->vp);
 		else
 			VOP_UNLOCK(imgp->vp, 0);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	free(imgp->freepath, M_TEMP);
 
 	if (error == 0) {
 		if (p->p_ptevents & PTRACE_EXEC) {
 			PROC_LOCK(p);
 			if (p->p_ptevents & PTRACE_EXEC)
 				td->td_dbgflags |= TDB_EXEC;
 			PROC_UNLOCK(p);
 		}
 
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 	} else {
 		/* Failures before imgp->vp is set jump straight to this label. */
 exec_fail:
 		/* we're done here, clear P_INEXEC */
 		PROC_LOCK(p);
 		p->p_flag &= ~P_INEXEC;
 		PROC_UNLOCK(p);
 
 		SDT_PROBE1(proc, , , exec__failure, error);
 	}
 
 	/* oldcred is still non-NULL only if newcred was never installed. */
 	if (imgp->newcred != NULL && oldcred != NULL)
 		crfree(imgp->newcred);
 
 #ifdef MAC
 	mac_execve_exit(imgp);
 	mac_execve_interpreter_exit(interpvplabel);
 #endif
 	exec_free_args(args);
 
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (oldtextvp != NULL)
 		vrele(oldtextvp);
 #ifdef KTRACE
 	if (tracevp != NULL)
 		vrele(tracevp);
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	pargs_drop(oldargs);
 	pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 	if (euip != NULL)
 		uifree(euip);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exit1(td, 0, SIGABRT);
 		/* NOT REACHED */
 	}
 
 #ifdef KTRACE
 	if (error == 0)
 		ktrprocctor(p);
 #endif
 
-	return (error);
+	/*
+	 * We don't want cpu_set_syscall_retval() to overwrite any of
+	 * the register values put in place by exec_setregs().
+	 * Implementations of cpu_set_syscall_retval() will leave
+	 * registers unmodified when returning EJUSTRETURN.
+	 */
+	return (error == 0 ? EJUSTRETURN : error);
 }
 
 /*
  * Make the first page of the image file available to the image activators.
  *
  * Page 0 of the vnode's VM object is grabbed; if it is not fully valid it
  * is read in through the pager, together with up to VM_INITIAL_PAGEIN - 1
  * following pages as read-ahead.  The page is then held, activated and
  * mapped through an sf_buf: imgp->firstpage receives the sf_buf and
  * imgp->image_header its kernel virtual address.  Any mapping already
  * present in imgp->firstpage is released first.
  *
  * Returns 0 on success, EACCES if the vnode has no VM object, and EIO if
  * the pager has no page 0 or the read fails.
  */
 int
 exec_map_first_page(struct image_params *imgp)
 {
 	int rv, i, after, initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_WLOCK(object);
 #if VM_NRESERVLEVEL > 0
 	vm_object_color(object, 0);
 #endif
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (ma[0]->valid != VM_PAGE_BITS_ALL) {
 		vm_page_xbusy(ma[0]);
 		if (!vm_pager_has_page(object, 0, NULL, &after)) {
 			vm_page_lock(ma[0]);
 			vm_page_free(ma[0]);
 			vm_page_unlock(ma[0]);
 			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 		initial_pagein = min(after, VM_INITIAL_PAGEIN);
 		KASSERT(initial_pagein <= object->size,
 		    ("%s: initial_pagein %d object->size %ju",
 		    __func__, initial_pagein, (uintmax_t )object->size));
 		/*
 		 * Collect a run of read-ahead pages following page 0.  The
 		 * run ends at the first page that already has valid data,
 		 * cannot be exclusively busied, or cannot be allocated.
 		 */
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
 				if (ma[i]->valid)
 					break;
 				/*
 				 * vm_page_tryxbusy() returns non-zero when
 				 * it managed to busy the page, so stop only
 				 * when it fails; a page that was busied
 				 * belongs to the pagein run.
 				 */
 				if (!vm_page_tryxbusy(ma[i]))
 					break;
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		/* Only the pages collected above take part in the read. */
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL);
 		if (rv != VM_PAGER_OK) {
 			for (i = 0; i < initial_pagein; i++) {
 				vm_page_lock(ma[i]);
 				vm_page_free(ma[i]);
 				vm_page_unlock(ma[i]);
 			}
 			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 		vm_page_xunbusy(ma[0]);
 		for (i = 1; i < initial_pagein; i++)
 			vm_page_readahead_finish(ma[i]);
 	}
 	vm_page_lock(ma[0]);
 	vm_page_hold(ma[0]);
 	vm_page_activate(ma[0]);
 	vm_page_unlock(ma[0]);
 	VM_OBJECT_WUNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 /*
  * Undo exec_map_first_page(): release the sf_buf mapping recorded in
  * imgp->firstpage, if any, and drop the hold on the underlying page.
  */
 void
 exec_unmap_first_page(struct image_params *imgp)
 {
 	vm_page_t pg;
 
 	if (imgp->firstpage == NULL)
 		return;
 
 	/* Fetch the page before the sf_buf that refers to it is freed. */
 	pg = sf_buf_page(imgp->firstpage);
 	sf_buf_free(imgp->firstpage);
 	imgp->firstpage = NULL;
 
 	vm_page_lock(pg);
 	vm_page_unhold(pg);
 	vm_page_unlock(pg);
 }
 
 /*
  * Destroy old address space, and allocate a new stack.
  *	The new stack is only sgrowsiz large because it is grown
  *	automatically on a page fault.
  */
 int
 exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_object_t obj;
 	struct rlimit rlim_stack;
 	vm_offset_t sv_minuser, stack_addr;
 	vm_map_t map;
 	u_long ssiz;
 
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	/* May be called with Giant held */
 	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	if (map_at_zero)
 		sv_minuser = sv->sv_minuser;
 	else
 		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 		/* An exec terminates mlockall(MCL_FUTURE). */
 		vm_map_lock(map);
 		vm_map_modflags(map, 0, MAP_WIREFUTURE);
 		vm_map_unlock(map);
 	} else {
 		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
 		if (error)
 			return (error);
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Map a shared page */
 	obj = sv->sv_shared_page_obj;
 	if (obj != NULL) {
 		vm_object_reference(obj);
 		error = vm_map_fixed(map, obj, 0,
 		    sv->sv_shared_page_base, sv->sv_shared_page_len,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
 		if (error != KERN_SUCCESS) {
 			vm_object_deallocate(obj);
 			return (vm_mmap_to_errno(error));
 		}
 	}
 
 	/* Allocate a new stack */
 	if (imgp->stack_sz != 0) {
 		ssiz = trunc_page(imgp->stack_sz);
 		PROC_LOCK(p);
 		lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
 		PROC_UNLOCK(p);
 		if (ssiz > rlim_stack.rlim_max)
 			ssiz = rlim_stack.rlim_max;
 		if (ssiz > rlim_stack.rlim_cur) {
 			rlim_stack.rlim_cur = ssiz;
 			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
 		}
 	} else if (sv->sv_maxssiz != NULL) {
 		ssiz = *sv->sv_maxssiz;
 	} else {
 		ssiz = maxssiz;
 	}
 	stack_addr = sv->sv_usrstack - ssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error != KERN_SUCCESS)
 		return (vm_mmap_to_errno(error));
 
 	/*
 	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
 	 * are still used to enforce the stack rlimit on the process stack.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)stack_addr;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process address
  * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	u_long argp, envp;
 	int error;
 	size_t length;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Allocate demand-paged memory for the file name, argument, and
 	 * environment strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Copy the file name.
 	 */
 	if (fname != NULL) {
 		args->fname = args->buf;
 		error = (segflg == UIO_SYSSPACE) ?
 		    copystr(fname, args->fname, PATH_MAX, &length) :
 		    copyinstr(fname, args->fname, PATH_MAX, &length);
 		if (error != 0)
 			goto err_exit;
 	} else
 		length = 0;
 
 	args->begin_argv = args->buf + length;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	/*
 	 * extract arguments first
 	 */
 	for (;;) {
 		error = fueword(argv++, &argp);
 		if (error == -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if (argp == 0)
 			break;
 		error = copyinstr((void *)(uintptr_t)argp, args->endp,
 		    args->stringspace, &length);
 		if (error != 0) {
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		for (;;) {
 			error = fueword(envv++, &envp);
 			if (error == -1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if (envp == 0)
 				break;
 			error = copyinstr((void *)(uintptr_t)envp,
 			    args->endp, args->stringspace, &length);
 			if (error != 0) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Build image arguments from a flat argument data buffer and a list of
  * file descriptors.
  *
  * data/datalen	user buffer copied in as the argument area; a terminating
  *		null byte is appended and argc is the number of null bytes.
  * fds/fdslen	user array of descriptor numbers used to create a remapped
  *		descriptor table, stored in args->fdp.
  *
  * There is no file name and no environment.  Returns 0, E2BIG when
  * datalen is not below ARG_MAX or fdslen exceeds the current table's
  * fd_lastfile + 1, or another errno value; on failure everything
  * allocated here is released.
  */
 int
 exec_copyin_data_fds(struct thread *td, struct image_args *args,
     const void *data, size_t datalen, const int *fds, size_t fdslen)
 {
 	struct filedesc *curfdp;
 	const char *cp;
 	int *fdmap;
 	int error;
 
 	memset(args, '\0', sizeof(*args));
 	curfdp = td->td_proc->p_fd;
 	if (datalen >= ARG_MAX || fdslen > curfdp->fd_lastfile + 1)
 		return (E2BIG);
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	args->begin_argv = args->buf;
 	args->stringspace = ARG_MAX;
 
 	if (datalen == 0) {
 		/* No argument buffer provided. */
 		args->endp = args->begin_argv;
 	} else {
 		/*
 		 * Argument buffer has been provided. Copy it into the
 		 * kernel as a single string and add a terminating null
 		 * byte.
 		 */
 		error = copyin(data, args->begin_argv, datalen);
 		if (error != 0)
 			goto fail;
 		args->begin_argv[datalen] = '\0';
 		args->endp = args->begin_argv + datalen + 1;
 		args->stringspace -= datalen + 1;
 
 		/*
 		 * Traditional argument counting. Count the number of
 		 * null bytes.
 		 */
 		cp = args->begin_argv;
 		while (cp < args->endp) {
 			if (*cp == '\0')
 				++args->argc;
 			++cp;
 		}
 	}
 	/* There are no environment variables. */
 	args->begin_envv = args->endp;
 
 	/* Create new file descriptor table. */
 	fdmap = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK);
 	error = copyin(fds, fdmap, fdslen * sizeof(int));
 	if (error == 0)
 		error = fdcopy_remapped(curfdp, fdmap, fdslen, &args->fdp);
 	free(fdmap, M_TEMP);
 	if (error != 0)
 		goto fail;
 
 	return (0);
 
 fail:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Descriptor for one preallocated argument buffer in exec_map.
  */
 struct exec_args_kva {
 	vm_offset_t addr;	/* start of the exec_map_entry_size range */
 	u_int gen;		/* exec_args_gen value seen at last release */
 	SLIST_ENTRY(exec_args_kva) next;	/* free list linkage */
 };
 
 /* Per-CPU slot holding at most one free buffer. */
 static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva);
 
 /* Free buffers not parked in a per-CPU slot; guarded by exec_args_kva_mtx. */
 static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist;
 static struct mtx exec_args_kva_mtx;
 /* Incremented by exec_args_kva_lowmem() on each low-memory event. */
 static u_int exec_args_gen;
 
 /*
  * SYSINIT hook: initialise the free list and its mutex, then preallocate
  * exec_map_entries argument buffers of exec_map_entry_size bytes each from
  * exec_map and put them on the free list.
  */
 static void
 exec_prealloc_args_kva(void *arg __unused)
 {
 	struct exec_args_kva *ent;
 	u_int n;
 
 	SLIST_INIT(&exec_args_kva_freelist);
 	mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF);
 
 	n = 0;
 	while (n < exec_map_entries) {
 		ent = malloc(sizeof(*ent), M_PARGS, M_WAITOK);
 		ent->addr = kmap_alloc_wait(exec_map, exec_map_entry_size);
 		ent->gen = exec_args_gen;
 		SLIST_INSERT_HEAD(&exec_args_kva_freelist, ent, next);
 		n++;
 	}
 }
 SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL);
 
 /*
  * Obtain an argument buffer.  The current CPU's slot is tried first; if it
  * is empty the free list is used, sleeping until a buffer is released when
  * the list is empty too.  The descriptor is stored through `cookie' for
  * the later release and the buffer's address is returned.
  */
 static vm_offset_t
 exec_alloc_args_kva(void **cookie)
 {
 	struct exec_args_kva *ent;
 
 	ent = (void *)atomic_readandclear_ptr(
 	    (uintptr_t *)DPCPU_PTR(exec_args_kva));
 	if (ent == NULL) {
 		mtx_lock(&exec_args_kva_mtx);
 		for (;;) {
 			ent = SLIST_FIRST(&exec_args_kva_freelist);
 			if (ent != NULL)
 				break;
 			(void)mtx_sleep(&exec_args_kva_freelist,
 			    &exec_args_kva_mtx, 0, "execkva", 0);
 		}
 		SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next);
 		mtx_unlock(&exec_args_kva_mtx);
 	}
 	*(struct exec_args_kva **)cookie = ent;
 	return (ent->addr);
 }
 
 /*
  * Return an argument buffer to the pool.  If the buffer's recorded
  * generation differs from `gen', MADV_FREE is applied to its range and the
  * generation is updated.  The buffer is parked in the current CPU's slot
  * when that slot is empty; otherwise it goes onto the free list and one
  * sleeper is woken.
  */
 static void
 exec_release_args_kva(struct exec_args_kva *argkva, u_int gen)
 {
 	vm_offset_t start;
 
 	start = argkva->addr;
 	if (argkva->gen != gen) {
 		vm_map_madvise(exec_map, start, start + exec_map_entry_size,
 		    MADV_FREE);
 		argkva->gen = gen;
 	}
 
 	if (atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva),
 	    (uintptr_t)NULL, (uintptr_t)argkva))
 		return;
 
 	/* The per-CPU slot is occupied; fall back to the shared list. */
 	mtx_lock(&exec_args_kva_mtx);
 	SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next);
 	wakeup_one(&exec_args_kva_freelist);
 	mtx_unlock(&exec_args_kva_mtx);
 }
 
 static void
 exec_free_args_kva(void *cookie)
 {
 
 	exec_release_args_kva(cookie, exec_args_gen);
 }
 
 /*
  * vm_lowmem event handler.  Advances the generation counter and pushes
  * every currently free buffer (those on the free list and those parked in
  * per-CPU slots) through exec_release_args_kva() with the new generation.
  */
 static void
 exec_args_kva_lowmem(void *arg __unused)
 {
 	SLIST_HEAD(, exec_args_kva) reclaim;
 	struct exec_args_kva *ent;
 	u_int newgen;
 	int cpu;
 
 	newgen = atomic_fetchadd_int(&exec_args_gen, 1) + 1;
 
 	/*
 	 * Force an madvise of each KVA range. Any currently allocated ranges
 	 * will have MADV_FREE applied once they are freed.
 	 */
 	SLIST_INIT(&reclaim);
 	mtx_lock(&exec_args_kva_mtx);
 	SLIST_SWAP(&reclaim, &exec_args_kva_freelist, exec_args_kva);
 	mtx_unlock(&exec_args_kva_mtx);
 	for (;;) {
 		ent = SLIST_FIRST(&reclaim);
 		if (ent == NULL)
 			break;
 		SLIST_REMOVE_HEAD(&reclaim, next);
 		exec_release_args_kva(ent, newgen);
 	}
 
 	/* Now the buffers parked in the per-CPU slots. */
 	CPU_FOREACH(cpu) {
 		ent = (void *)atomic_readandclear_ptr(
 		    (uintptr_t *)DPCPU_ID_PTR(cpu, exec_args_kva));
 		if (ent == NULL)
 			continue;
 		exec_release_args_kva(ent, newgen);
 	}
 }
 EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL,
     EVENTHANDLER_PRI_ANY);
 
 /*
  * Allocate temporary demand-paged, zero-filled memory for the file name,
  * argument, and environment strings.
  *
  * The buffer address is stored in args->buf and the allocator cookie in
  * args->bufkva (later handed to exec_free_args_kva() by exec_free_args()).
  * Always returns 0.
  */
 int
 exec_alloc_args(struct image_args *args)
 {
 
 	args->buf = (char *)exec_alloc_args_kva(&args->bufkva);
 	return (0);
 }
 
 /*
  * Release the resources hanging off 'args': the argument string buffer
  * (via its KVA cookie), the separately allocated file name buffer, and the
  * remapped file descriptor table.  The two buffer pointers are reset to NULL
  * after being released; each step is skipped when its pointer is NULL.
  */
 void
 exec_free_args(struct image_args *args)
 {
 	void *kva_cookie;
 
 	if (args->buf != NULL) {
 		kva_cookie = args->bufkva;
 		exec_free_args_kva(kva_cookie);
 		args->buf = NULL;
 	}
 
 	if (args->fname_buf != NULL) {
 		free(args->fname_buf, M_TEMP);
 		args->fname_buf = NULL;
 	}
 
 	if (args->fdp != NULL) {
 		fdescfree_remapped(args->fdp);
 	}
 }
 
 /*
  * Copy strings out to the new process address space, constructing new arg
  * and env vector tables. Return a pointer to the base so that it can be used
  * as the initial stack pointer.
  *
  * Working downwards from the ps_strings address, the following are placed in
  * order: the signal trampoline (only when sv_sigcode_base is 0 and a size is
  * provided), the executable path (only when both imgp->execpath and
  * imgp->auxargs are set), the SSP canary, the pagesizes array, the argument
  * and environment strings, and finally the argv/envp pointer vectors.  The
  * user addresses of the path, canary and pagesizes array are recorded in
  * imgp for later use.
  *
  * NOTE(review): the return values of copyout(), suword() and suword32() are
  * not checked anywhere in this function.
  */
 register_t *
 exec_copyout_strings(struct image_params *imgp)
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp;
 	uintptr_t destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	size_t execpath_len;
 	int szsigcode, szps;
 	char canary[sizeof(long) * 8];
 
 	szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	if (p->p_sysent->sv_sigcode_base == 0) {
 		if (p->p_sysent->sv_szsigcode != NULL)
 			szsigcode = *(p->p_sysent->sv_szsigcode);
 	}
 	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0) {
 		destp -= szsigcode;
 		destp = rounddown2(destp, sizeof(void *));
 		copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode);
 	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
 		destp -= execpath_len;
 		imgp->execpathp = destp;
 		copyout(imgp->execpath, (void *)destp, execpath_len);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	destp -= sizeof(canary);
 	imgp->canary = destp;
 	copyout(canary, (void *)destp, sizeof(canary));
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	destp -= szps;
 	destp = rounddown2(destp, sizeof(void *));
 	imgp->pagesizes = destp;
 	copyout(pagesizes, (void *)destp, szps);
 	imgp->pagesizeslen = szps;
 
 	/*
 	 * Reserve room for the strings: ARG_MAX minus the space still unused
 	 * in the argument buffer is the number of bytes copied out below.
 	 */
 	destp -= ARG_MAX - imgp->args->stringspace;
 	destp = rounddown2(destp, sizeof(void *));
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets, and imgp->auxarg_size is room
 		 * for the arguments of the runtime loader.
 		 */
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size)
 		    * sizeof(char *));
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc
 		    + 2) * sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 *
 	 * stringp walks the kernel copy of the strings while destp tracks
 	 * the matching user address; both advance past each NUL terminator.
 	 */
 	for (; argc > 0; --argc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp++, 0);
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Decide whether the file behind imgp->vp may be executed with the current
  * thread's credentials.
  *	The caller holds the lock on imgp->vp.
  *	Returns 0 on success (and sets imgp->opened), otherwise an errno value.
  */
 int
 exec_check_permissions(struct image_params *imgp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	int error, writecount;
 
 	/* The attribute checks below all rely on a fresh vattr. */
 	error = VOP_GETATTR(vp, attr, td->td_ucred);
 	if (error != 0)
 		return (error);
 
 #ifdef MAC
 	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 	if (error != 0)
 		return (error);
 #endif
 
 	/* Execution must not be disabled for the containing filesystem. */
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
 		return (EACCES);
 
 	/*
 	 * Require at least one execute bit; without this a privileged user
 	 * would pass the access check even for non-executable files.
 	 */
 	if ((attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
 		return (EACCES);
 
 	/* Only regular files can be executed. */
 	if (attr->va_type != VREG)
 		return (EACCES);
 
 	/* An empty file cannot contain an image. */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/* Execute permission for the current credentials. */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 
 	/* Refuse to execute a file that is currently open for writing. */
 	error = VOP_GET_WRITECOUNT(vp, &writecount);
 	if (error != 0)
 		return (error);
 	if (writecount != 0)
 		return (ETXTBSY);
 
 	/*
 	 * Give the filesystem its open hook (a no-op in the general case)
 	 * and remember that the vnode was opened.
 	 */
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error != 0)
 		return (error);
 	imgp->opened = 1;
 	return (0);
 }
 
 /*
  * Register an exec handler.
  *
  * Builds a new NULL-terminated handler table holding every existing entry
  * followed by 'execsw_arg', frees the old table and installs the new one.
  * Always returns 0.
  */
 int
 exec_register(const struct execsw *execsw_arg)
 {
 	const struct execsw **cur, **dst, **table;
 	int nslots;
 
 	/* One slot for the new handler, one for the terminating NULL. */
 	nslots = 2;
 	if (execsw != NULL) {
 		for (cur = execsw; *cur != NULL; cur++)
 			nslots++;
 	}
 
 	table = malloc(nslots * sizeof(*cur), M_TEMP, M_WAITOK);
 	dst = table;
 	if (execsw != NULL) {
 		for (cur = execsw; *cur != NULL; cur++)
 			*dst++ = *cur;
 	}
 	dst[0] = execsw_arg;
 	dst[1] = NULL;
 
 	if (execsw != NULL)
 		free(execsw, M_TEMP);
 	execsw = table;
 	return (0);
 }
 
 /*
  * Unregister an exec handler.
  *
  * Panics if no handler table exists.  Returns ENOENT when 'execsw_arg' is
  * not in the table; otherwise installs a freshly allocated table without
  * it, frees the old table and returns 0.
  */
 int
 exec_unregister(const struct execsw *execsw_arg)
 {
 	const struct execsw **cur, **dst, **table;
 	int nslots, found;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	/* One pass: look for the handler and count the surviving entries. */
 	nslots = 1;		/* trailing NULL */
 	found = 0;
 	for (cur = execsw; *cur != NULL; cur++) {
 		if (*cur == execsw_arg)
 			found = 1;
 		else
 			nslots++;
 	}
 	if (!found)
 		return (ENOENT);
 
 	table = malloc(nslots * sizeof(*cur), M_TEMP, M_WAITOK);
 	dst = table;
 	for (cur = execsw; *cur != NULL; cur++) {
 		if (*cur != execsw_arg)
 			*dst++ = *cur;
 	}
 	*dst = NULL;
 
 	/* execsw is known to be non-NULL here (checked on entry). */
 	free(execsw, M_TEMP);
 	execsw = table;
 	return (0);
 }
Index: projects/bsd_rdma_4_9/sys/netinet/sctp_ss_functions.c
===================================================================
--- projects/bsd_rdma_4_9/sys/netinet/sctp_ss_functions.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/netinet/sctp_ss_functions.c	(revision 326162)
@@ -1,997 +1,999 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
  * Copyright (c) 2010-2012, by Michael Tuexen. All rights reserved.
  * Copyright (c) 2010-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2010-2012, by Robin Seggelmann. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_pcb.h>
 
 /*
  * Default simple round-robin algorithm.
  * Just iterates the streams in the order they appear.
  */
 
 /* Forward declarations; both functions are defined later in this file. */
 static void
 sctp_ss_default_add(struct sctp_tcb *, struct sctp_association *,
     struct sctp_stream_out *,
     struct sctp_stream_queue_pending *, int);
 
 static void
 sctp_ss_default_remove(struct sctp_tcb *, struct sctp_association *,
     struct sctp_stream_out *,
     struct sctp_stream_queue_pending *, int);
 
 /*
  * Reset the scheduler state of 'asoc' (no locked stream, no last stream,
  * empty wheel) and then offer every outgoing stream of stcb->asoc to the
  * association's configured add_to_stream callback.
  */
 static void
 sctp_ss_default_init(struct sctp_tcb *stcb, struct sctp_association *asoc,
     int holds_lock)
 {
 	struct sctp_stream_out *strq;
 	uint16_t sid;
 
 	asoc->ss_data.locked_on_sending = NULL;
 	asoc->ss_data.last_out_stream = NULL;
 	TAILQ_INIT(&asoc->ss_data.out.wheel);
 
 	/*
 	 * Stream queues may already hold data when the scheduler of an
 	 * existing association is switched, so all of them are handed to
 	 * the add callback.
 	 */
 	for (sid = 0; sid < stcb->asoc.streamoutcnt; sid++) {
 		strq = &stcb->asoc.strmout[sid];
 		stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb,
 		    &stcb->asoc, strq, NULL, holds_lock);
 	}
 }
 
 /*
  * Empty the scheduling wheel, marking every removed stream as "not on the
  * wheel" by clearing its link pointers, and forget the last scheduled
  * stream.  The send lock is taken here unless the caller already holds it.
  */
 static void
 sctp_ss_default_clear(struct sctp_tcb *stcb, struct sctp_association *asoc,
     int clear_values SCTP_UNUSED, int holds_lock)
 {
 	struct sctp_stream_out *strq;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	while ((strq = TAILQ_FIRST(&asoc->ss_data.out.wheel)) != NULL) {
 		TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq,
 		    ss_params.rr.next_spoke);
 		strq->ss_params.rr.next_spoke.tqe_next = NULL;
 		strq->ss_params.rr.next_spoke.tqe_prev = NULL;
 	}
 	asoc->ss_data.last_out_stream = NULL;
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 static void
 sctp_ss_default_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq)
 {
 	if (with_strq != NULL) {
 		if (stcb->asoc.ss_data.locked_on_sending == with_strq) {
 			stcb->asoc.ss_data.locked_on_sending = strq;
 		}
 		if (stcb->asoc.ss_data.last_out_stream == with_strq) {
 			stcb->asoc.ss_data.last_out_stream = strq;
 		}
 	}
 	strq->ss_params.rr.next_spoke.tqe_next = NULL;
 	strq->ss_params.rr.next_spoke.tqe_prev = NULL;
 	return;
 }
 
 /*
  * Append 'strq' to the tail of the wheel, but only if it has queued data
  * and both of its wheel link pointers are NULL (i.e. it is not on the wheel
  * yet).  The send lock is taken here unless the caller already holds it.
  */
 static void
 sctp_ss_default_add(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq,
     struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock)
 {
 	int has_data, off_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	has_data = !TAILQ_EMPTY(&strq->outqueue);
 	off_wheel = (strq->ss_params.rr.next_spoke.tqe_next == NULL &&
 	    strq->ss_params.rr.next_spoke.tqe_prev == NULL);
 	if (has_data && off_wheel) {
 		TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel,
 		    strq, ss_params.rr.next_spoke);
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /* Return 1 when the scheduling wheel holds no stream, 0 otherwise. */
 static int
 sctp_ss_default_is_empty(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc)
 {
 	return (TAILQ_EMPTY(&asoc->ss_data.out.wheel) ? 1 : 0);
 }
 
 /*
  * Take 'strq' off the wheel once its output queue has drained.  Nothing
  * happens if the queue still holds data or the stream is not on the wheel.
  * If 'strq' was the last scheduled stream, that marker moves to its
  * predecessor (wrapping to the wheel's tail), or to NULL when 'strq' is
  * the only entry.
  */
 static void
 sctp_ss_default_remove(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq,
     struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock)
 {
 	struct sctp_stream_out *prev;
 	int on_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	on_wheel = (strq->ss_params.rr.next_spoke.tqe_next != NULL ||
 	    strq->ss_params.rr.next_spoke.tqe_prev != NULL);
 	if (TAILQ_EMPTY(&strq->outqueue) && on_wheel) {
 		if (asoc->ss_data.last_out_stream == strq) {
 			prev = TAILQ_PREV(strq, sctpwheel_listhead,
 			    ss_params.rr.next_spoke);
 			if (prev == NULL) {
 				prev = TAILQ_LAST(&asoc->ss_data.out.wheel,
 				    sctpwheel_listhead);
 			}
 			if (prev == strq) {
 				prev = NULL;
 			}
 			asoc->ss_data.last_out_stream = prev;
 		}
 		TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke);
 		strq->ss_params.rr.next_spoke.tqe_next = NULL;
 		strq->ss_params.rr.next_spoke.tqe_prev = NULL;
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 
 /*
  * Pick the stream to send from next.
  *
  * A stream the association is locked on is returned unconditionally.
  * Otherwise the stream following last_out_stream on the wheel is chosen
  * (wrapping to the head; the head itself when there is no last stream).
  * With CMT off and a specific 'net' requested, streams whose first queued
  * message is bound to a different net are skipped; NULL is returned if the
  * search comes back around to last_out_stream.  May also return NULL when
  * the wheel is empty.
  */
 static struct sctp_stream_out *
 sctp_ss_default_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net,
     struct sctp_association *asoc)
 {
 	struct sctp_stream_out *strq, *strqt;
 
 	if (asoc->ss_data.locked_on_sending) {
 		return (asoc->ss_data.locked_on_sending);
 	}
 	strqt = asoc->ss_data.last_out_stream;
 default_again:
 	/* Find the next stream to use */
 	if (strqt == NULL) {
 		strq = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 	} else {
 		strq = TAILQ_NEXT(strqt, ss_params.rr.next_spoke);
 		if (strq == NULL) {
 			strq = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 		}
 	}
 
 	/*
 	 * If CMT is off, we must validate that the stream in question has
 	 * the first item pointed towards the network destination requested
 	 * by the caller. Note that if we turn out to be locked to a stream
 	 * (assigning TSNs) then we must stop, since we cannot look for
 	 * another stream with data to send to that destination. In CMT's
 	 * case, by skipping this check, we will send one data packet
 	 * towards the requested net.
 	 */
 	if (net != NULL && strq != NULL &&
 	    SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) {
 		if (TAILQ_FIRST(&strq->outqueue) &&
 		    TAILQ_FIRST(&strq->outqueue)->net != NULL &&
 		    TAILQ_FIRST(&strq->outqueue)->net != net) {
 			if (strq == asoc->ss_data.last_out_stream) {
 				/* Went all the way around without a match. */
 				return (NULL);
 			} else {
 				strqt = strq;
 				goto default_again;
 			}
 		}
 	}
 	return (strq);
 }
 
 /*
  * Record that 'strq' has just been scheduled.  It becomes the last output
  * stream, and the association stays locked on it only when I-DATA is not
  * supported and the first queued message has been partially taken.
  */
 static void
 sctp_ss_default_scheduled(struct sctp_tcb *stcb,
     struct sctp_nets *net SCTP_UNUSED,
     struct sctp_association *asoc,
     struct sctp_stream_out *strq,
     int moved_how_much SCTP_UNUSED)
 {
 	struct sctp_stream_queue_pending *sp;
 	struct sctp_stream_out *lock_on;
 
 	asoc->ss_data.last_out_stream = strq;
 
 	lock_on = NULL;
 	if (stcb->asoc.idata_supported == 0) {
 		sp = TAILQ_FIRST(&strq->outqueue);
 		if ((sp != NULL) && (sp->some_taken == 1)) {
 			lock_on = strq;
 		}
 	}
 	stcb->asoc.ss_data.locked_on_sending = lock_on;
 }
 
 /* Packet-completion hook; this scheduler keeps no per-packet state. */
 static void
 sctp_ss_default_packet_done(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net SCTP_UNUSED,
     struct sctp_association *asoc SCTP_UNUSED)
 {
 	/* Intentionally empty. */
 }
 
 /* This scheduler has no per-stream value to report; always returns -1. */
 static int
 sctp_ss_default_get_value(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED,
     struct sctp_stream_out *strq SCTP_UNUSED, uint16_t *value SCTP_UNUSED)
 {
 	const int unsupported = -1;
 
 	return (unsupported);
 }
 
 /* This scheduler has no per-stream value to set; always returns -1. */
 static int
 sctp_ss_default_set_value(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED,
     struct sctp_stream_out *strq SCTP_UNUSED, uint16_t value SCTP_UNUSED)
 {
 	const int unsupported = -1;
 
 	return (unsupported);
 }
 
 /*
  * Return non-zero when exactly one message is queued on the association,
  * the association is locked on a stream, and the first message of that
  * stream is not yet complete.  Returns 0 in every other case.
  */
 static int
 sctp_ss_default_is_user_msgs_incomplete(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc)
 {
 	struct sctp_stream_out *locked;
 	struct sctp_stream_queue_pending *first;
 
 	if (asoc->stream_queue_cnt != 1) {
 		return (0);
 	}
 	locked = asoc->ss_data.locked_on_sending;
 	first = (locked != NULL) ? TAILQ_FIRST(&locked->outqueue) : NULL;
 	if (first != NULL) {
 		return (!first->msg_is_complete);
 	}
 	return (0);
 }
 
 /*
  * Real round-robin algorithm.
  * Always iterates the streams in ascending order.
  */
 /*
  * Put 'strq' on the wheel, keeping the wheel sorted by ascending stream id:
  * the stream is inserted in front of the first entry whose sid is not
  * smaller than its own, or at the tail if there is none.  Streams without
  * queued data, or already on the wheel, are left alone.
  */
 static void
 sctp_ss_rr_add(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq,
     struct sctp_stream_queue_pending *sp SCTP_UNUSED, int holds_lock)
 {
 	struct sctp_stream_out *pos;
 	int off_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	off_wheel = (strq->ss_params.rr.next_spoke.tqe_next == NULL &&
 	    strq->ss_params.rr.next_spoke.tqe_prev == NULL);
 	if (!TAILQ_EMPTY(&strq->outqueue) && off_wheel) {
 		if (TAILQ_EMPTY(&asoc->ss_data.out.wheel)) {
 			TAILQ_INSERT_HEAD(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke);
 		} else {
 			/* pos ends up NULL when every entry has a smaller sid. */
 			TAILQ_FOREACH(pos, &asoc->ss_data.out.wheel, ss_params.rr.next_spoke) {
 				if (pos->sid >= strq->sid) {
 					break;
 				}
 			}
 			if (pos == NULL) {
 				TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.rr.next_spoke);
 			} else {
 				TAILQ_INSERT_BEFORE(pos, strq, ss_params.rr.next_spoke);
 			}
 		}
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /*
  * Real round-robin per packet algorithm.
  * Always iterates the streams in ascending order and
  * only fills messages of the same stream in a packet.
  */
 /*
  * Return the stream currently recorded as last_out_stream; advancing to the
  * next stream is done in sctp_ss_rrp_packet_done().
  */
 static struct sctp_stream_out *
 sctp_ss_rrp_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net SCTP_UNUSED,
     struct sctp_association *asoc)
 {
 	struct sctp_stream_out *current;
 
 	current = asoc->ss_data.last_out_stream;
 	return (current);
 }
 
 /*
  * Advance last_out_stream to the next stream on the wheel once a packet is
  * done (wrapping to the head of the wheel).  With CMT off and a specific
  * 'net' given, streams whose first queued message is bound to a different
  * net are skipped; if the search comes back around to the previous
  * last_out_stream, last_out_stream is set to NULL.
  */
 static void
 sctp_ss_rrp_packet_done(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net,
     struct sctp_association *asoc)
 {
 	struct sctp_stream_out *strq, *strqt;
 
 	strqt = asoc->ss_data.last_out_stream;
 rrp_again:
 	/* Find the next stream to use */
 	if (strqt == NULL) {
 		strq = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 	} else {
 		strq = TAILQ_NEXT(strqt, ss_params.rr.next_spoke);
 		if (strq == NULL) {
 			strq = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 		}
 	}
 
 	/*
 	 * If CMT is off, we must validate that the stream in question has
 	 * the first item pointed towards the network destination requested
 	 * by the caller. Note that if we turn out to be locked to a stream
 	 * (assigning TSNs) then we must stop, since we cannot look for
 	 * another stream with data to send to that destination. In CMT's
 	 * case, by skipping this check, we will send one data packet
 	 * towards the requested net.
 	 */
 	if (net != NULL && strq != NULL &&
 	    SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) {
 		if (TAILQ_FIRST(&strq->outqueue) &&
 		    TAILQ_FIRST(&strq->outqueue)->net != NULL &&
 		    TAILQ_FIRST(&strq->outqueue)->net != net) {
 			if (strq == asoc->ss_data.last_out_stream) {
 				/* Went all the way around without a match. */
 				strq = NULL;
 			} else {
 				strqt = strq;
 				goto rrp_again;
 			}
 		}
 	}
 	asoc->ss_data.last_out_stream = strq;
 	return;
 }
 
 
 /*
  * Priority algorithm.
  * Always prefers streams based on their priority id.
  */
 /*
  * Empty the priority wheel.  Each removed stream has its link pointers
  * cleared and, when 'clear_values' is set, its priority reset to 0.  The
  * last scheduled stream is forgotten.
  */
 static void
 sctp_ss_prio_clear(struct sctp_tcb *stcb, struct sctp_association *asoc,
     int clear_values, int holds_lock)
 {
 	struct sctp_stream_out *strq;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	while ((strq = TAILQ_FIRST(&asoc->ss_data.out.wheel)) != NULL) {
 		if (clear_values) {
 			strq->ss_params.prio.priority = 0;
 		}
 		TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq,
 		    ss_params.prio.next_spoke);
 		strq->ss_params.prio.next_spoke.tqe_next = NULL;
 		strq->ss_params.prio.next_spoke.tqe_prev = NULL;
 	}
 	asoc->ss_data.last_out_stream = NULL;
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 static void
 sctp_ss_prio_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq)
 {
 	if (with_strq != NULL) {
 		if (stcb->asoc.ss_data.locked_on_sending == with_strq) {
 			stcb->asoc.ss_data.locked_on_sending = strq;
 		}
 		if (stcb->asoc.ss_data.last_out_stream == with_strq) {
 			stcb->asoc.ss_data.last_out_stream = strq;
 		}
 	}
 	strq->ss_params.prio.next_spoke.tqe_next = NULL;
 	strq->ss_params.prio.next_spoke.tqe_prev = NULL;
 	if (with_strq != NULL) {
 		strq->ss_params.prio.priority = with_strq->ss_params.prio.priority;
 	} else {
 		strq->ss_params.prio.priority = 0;
 	}
 	return;
 }
 
 /*
  * Put 'strq' on the wheel, keeping the wheel sorted by ascending priority
  * value: the stream goes in front of the first entry whose priority is not
  * smaller than its own, or at the tail if there is none.  Streams without
  * queued data, or already on the wheel, are left alone.
  */
 static void
 sctp_ss_prio_add(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED,
     int holds_lock)
 {
 	struct sctp_stream_out *pos;
 	int off_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	off_wheel = (strq->ss_params.prio.next_spoke.tqe_next == NULL &&
 	    strq->ss_params.prio.next_spoke.tqe_prev == NULL);
 	if (!TAILQ_EMPTY(&strq->outqueue) && off_wheel) {
 		if (TAILQ_EMPTY(&asoc->ss_data.out.wheel)) {
 			TAILQ_INSERT_HEAD(&asoc->ss_data.out.wheel, strq, ss_params.prio.next_spoke);
 		} else {
 			/* pos ends up NULL when every entry has a smaller priority. */
 			TAILQ_FOREACH(pos, &asoc->ss_data.out.wheel, ss_params.prio.next_spoke) {
 				if (pos->ss_params.prio.priority >= strq->ss_params.prio.priority) {
 					break;
 				}
 			}
 			if (pos == NULL) {
 				TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.prio.next_spoke);
 			} else {
 				TAILQ_INSERT_BEFORE(pos, strq, ss_params.prio.next_spoke);
 			}
 		}
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /*
  * Take 'strq' off the priority wheel once its output queue has drained.
  * Nothing happens if the queue still holds data or the stream is not on
  * the wheel.  If 'strq' was the last scheduled stream, that marker moves to
  * its predecessor (wrapping to the tail), or to NULL when 'strq' is the
  * only entry.
  */
 static void
 sctp_ss_prio_remove(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED,
     int holds_lock)
 {
 	struct sctp_stream_out *prev;
 	int on_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	on_wheel = (strq->ss_params.prio.next_spoke.tqe_next != NULL ||
 	    strq->ss_params.prio.next_spoke.tqe_prev != NULL);
 	if (TAILQ_EMPTY(&strq->outqueue) && on_wheel) {
 		if (asoc->ss_data.last_out_stream == strq) {
 			prev = TAILQ_PREV(strq, sctpwheel_listhead,
 			    ss_params.prio.next_spoke);
 			if (prev == NULL) {
 				prev = TAILQ_LAST(&asoc->ss_data.out.wheel,
 				    sctpwheel_listhead);
 			}
 			if (prev == strq) {
 				prev = NULL;
 			}
 			asoc->ss_data.last_out_stream = prev;
 		}
 		TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq, ss_params.prio.next_spoke);
 		strq->ss_params.prio.next_spoke.tqe_next = NULL;
 		strq->ss_params.prio.next_spoke.tqe_prev = NULL;
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /*
  * Pick the stream to send from next under the priority scheduler.
  *
  * The successor of last_out_stream is chosen only if it has the same
  * priority value; otherwise selection restarts from the head of the wheel.
  * With CMT off and a specific 'net' requested, streams whose first queued
  * message is bound to a different net are skipped; NULL is returned if the
  * search arrives back at last_out_stream.  May also return NULL when the
  * wheel is empty.
  */
 static struct sctp_stream_out *
 sctp_ss_prio_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net,
     struct sctp_association *asoc)
 {
 	struct sctp_stream_out *strq, *strqt, *strqn;
 
 	strqt = asoc->ss_data.last_out_stream;
 prio_again:
 	/* Find the next stream to use */
 	if (strqt == NULL) {
 		strq = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 	} else {
 		strqn = TAILQ_NEXT(strqt, ss_params.prio.next_spoke);
 		if (strqn != NULL &&
 		    strqn->ss_params.prio.priority == strqt->ss_params.prio.priority) {
 			/* Round-robin among streams of equal priority. */
 			strq = strqn;
 		} else {
 			strq = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 		}
 	}
 
 	/*
 	 * If CMT is off, we must validate that the stream in question has
 	 * the first item pointed towards the network destination requested
 	 * by the caller. Note that if we turn out to be locked to a stream
 	 * (assigning TSNs) then we must stop, since we cannot look for
 	 * another stream with data to send to that destination. In CMT's
 	 * case, by skipping this check, we will send one data packet
 	 * towards the requested net.
 	 */
 	if (net != NULL && strq != NULL &&
 	    SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0) {
 		if (TAILQ_FIRST(&strq->outqueue) &&
 		    TAILQ_FIRST(&strq->outqueue)->net != NULL &&
 		    TAILQ_FIRST(&strq->outqueue)->net != net) {
 			if (strq == asoc->ss_data.last_out_stream) {
 				return (NULL);
 			} else {
 				strqt = strq;
 				goto prio_again;
 			}
 		}
 	}
 	return (strq);
 }
 
 /*
  * Store the priority of 'strq' in '*value' and return 1, or return -1
  * without touching '*value' when no stream is given.
  */
 static int
 sctp_ss_prio_get_value(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc SCTP_UNUSED,
     struct sctp_stream_out *strq, uint16_t *value)
 {
 	if (strq != NULL) {
 		*value = strq->ss_params.prio.priority;
 		return (1);
 	}
 	return (-1);
 }
 
 /*
  * Set the priority of 'strq' to 'value', then run the remove and add
  * handlers on it (both invoked with holds_lock = 1).  Returns 1 on
  * success, or -1 when no stream is given.
  */
 static int
 sctp_ss_prio_set_value(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq, uint16_t value)
 {
 	if (strq != NULL) {
 		strq->ss_params.prio.priority = value;
 		sctp_ss_prio_remove(stcb, asoc, strq, NULL, 1);
 		sctp_ss_prio_add(stcb, asoc, strq, NULL, 1);
 		return (1);
 	}
 	return (-1);
 }
 
 /*
  * Fair bandwidth algorithm.
  * Maintains an equal throughput per stream.
  */
 /*
  * Empty the fair-bandwidth wheel.  Each removed stream has its link
  * pointers cleared and, when 'clear_values' is set, its rounds counter
  * reset to -1.  The last scheduled stream is forgotten.
  */
 static void
 sctp_ss_fb_clear(struct sctp_tcb *stcb, struct sctp_association *asoc,
     int clear_values, int holds_lock)
 {
 	struct sctp_stream_out *strq;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	while ((strq = TAILQ_FIRST(&asoc->ss_data.out.wheel)) != NULL) {
 		if (clear_values) {
 			strq->ss_params.fb.rounds = -1;
 		}
 		TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq,
 		    ss_params.fb.next_spoke);
 		strq->ss_params.fb.next_spoke.tqe_next = NULL;
 		strq->ss_params.fb.next_spoke.tqe_prev = NULL;
 	}
 	asoc->ss_data.last_out_stream = NULL;
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 static void
 sctp_ss_fb_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq)
 {
 	if (with_strq != NULL) {
 		if (stcb->asoc.ss_data.locked_on_sending == with_strq) {
 			stcb->asoc.ss_data.locked_on_sending = strq;
 		}
 		if (stcb->asoc.ss_data.last_out_stream == with_strq) {
 			stcb->asoc.ss_data.last_out_stream = strq;
 		}
 	}
 	strq->ss_params.fb.next_spoke.tqe_next = NULL;
 	strq->ss_params.fb.next_spoke.tqe_prev = NULL;
 	if (with_strq != NULL) {
 		strq->ss_params.fb.rounds = with_strq->ss_params.fb.rounds;
 	} else {
 		strq->ss_params.fb.rounds = -1;
 	}
 	return;
 }
 
 /*
  * Append 'strq' to the tail of the wheel if it has queued data and is not
  * on the wheel yet.  A negative rounds counter is seeded with the length of
  * the first queued message before insertion.
  */
 static void
 sctp_ss_fb_add(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED,
     int holds_lock)
 {
 	struct sctp_stream_queue_pending *first;
 	int off_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	first = TAILQ_FIRST(&strq->outqueue);
 	off_wheel = (strq->ss_params.fb.next_spoke.tqe_next == NULL &&
 	    strq->ss_params.fb.next_spoke.tqe_prev == NULL);
 	if (first != NULL && off_wheel) {
 		if (strq->ss_params.fb.rounds < 0) {
 			strq->ss_params.fb.rounds = first->length;
 		}
 		TAILQ_INSERT_TAIL(&asoc->ss_data.out.wheel, strq, ss_params.fb.next_spoke);
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /*
  * Take 'strq' off the fair-bandwidth wheel once its output queue has
  * drained.  Nothing happens if the queue still holds data or the stream is
  * not on the wheel.  If 'strq' was the last scheduled stream, that marker
  * moves to its predecessor (wrapping to the tail), or to NULL when 'strq'
  * is the only entry.
  */
 static void
 sctp_ss_fb_remove(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp SCTP_UNUSED,
     int holds_lock)
 {
 	struct sctp_stream_out *prev;
 	int on_wheel;
 
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	on_wheel = (strq->ss_params.fb.next_spoke.tqe_next != NULL ||
 	    strq->ss_params.fb.next_spoke.tqe_prev != NULL);
 	if (TAILQ_EMPTY(&strq->outqueue) && on_wheel) {
 		if (asoc->ss_data.last_out_stream == strq) {
 			prev = TAILQ_PREV(strq, sctpwheel_listhead,
 			    ss_params.fb.next_spoke);
 			if (prev == NULL) {
 				prev = TAILQ_LAST(&asoc->ss_data.out.wheel,
 				    sctpwheel_listhead);
 			}
 			if (prev == strq) {
 				prev = NULL;
 			}
 			asoc->ss_data.last_out_stream = prev;
 		}
 		TAILQ_REMOVE(&asoc->ss_data.out.wheel, strq, ss_params.fb.next_spoke);
 		strq->ss_params.fb.next_spoke.tqe_next = NULL;
 		strq->ss_params.fb.next_spoke.tqe_prev = NULL;
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /*
  * Pick the stream to send from next under the fair-bandwidth scheduler.
  *
  * The scan starts at the head of the wheel when there is no last scheduled
  * stream or the wheel holds at most one entry, and otherwise at the stream
  * following last_out_stream.  Among the eligible streams visited, the one
  * with the smallest non-negative rounds counter becomes the candidate.  A
  * stream is eligible when CMT is on, or when CMT is off and either no
  * specific 'net' was requested, its first queued message has no net
  * assigned, or that message's net matches 'net'.
  *
  * Returns the chosen stream, or NULL if none qualified.
  */
 static struct sctp_stream_out *
 sctp_ss_fb_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net,
     struct sctp_association *asoc)
 {
 	struct sctp_stream_out *strq = NULL, *strqt;
 
 	if (asoc->ss_data.last_out_stream == NULL ||
 	    TAILQ_FIRST(&asoc->ss_data.out.wheel) == TAILQ_LAST(&asoc->ss_data.out.wheel, sctpwheel_listhead)) {
 		strqt = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 	} else {
 		strqt = TAILQ_NEXT(asoc->ss_data.last_out_stream, ss_params.fb.next_spoke);
 	}
 	/*
 	 * Walk the wheel, wrapping from the end back to the head, until the
 	 * cursor meets the current candidate again (or both are NULL).
 	 */
 	do {
 		if ((strqt != NULL) &&
 		    ((SCTP_BASE_SYSCTL(sctp_cmt_on_off) > 0) ||
 		    (SCTP_BASE_SYSCTL(sctp_cmt_on_off) == 0 &&
 		    (net == NULL || (TAILQ_FIRST(&strqt->outqueue) && TAILQ_FIRST(&strqt->outqueue)->net == NULL) ||
 		    (net != NULL && TAILQ_FIRST(&strqt->outqueue) && TAILQ_FIRST(&strqt->outqueue)->net != NULL &&
 		    TAILQ_FIRST(&strqt->outqueue)->net == net))))) {
 			/* Keep the eligible stream with the fewest rounds left. */
 			if ((strqt->ss_params.fb.rounds >= 0) && (strq == NULL ||
 			    strqt->ss_params.fb.rounds < strq->ss_params.fb.rounds)) {
 				strq = strqt;
 			}
 		}
 		if (strqt != NULL) {
 			strqt = TAILQ_NEXT(strqt, ss_params.fb.next_spoke);
 		} else {
 			strqt = TAILQ_FIRST(&asoc->ss_data.out.wheel);
 		}
 	} while (strqt != strq);
 	return (strq);
 }
 
 /*
  * Account for 'strq' having been scheduled.
  *
  * The association stays locked on 'strq' only when I-DATA is not supported
  * and its first queued message has been partially taken.  The rounds value
  * of 'strq' is subtracted from every stream on the wheel (clamped at 0),
  * after which 'strq' is re-seeded with the length of its next queued
  * message, or -1 if it has none.  'strq' becomes the last output stream.
  */
 static void
 sctp_ss_fb_scheduled(struct sctp_tcb *stcb, struct sctp_nets *net SCTP_UNUSED,
     struct sctp_association *asoc, struct sctp_stream_out *strq,
     int moved_how_much SCTP_UNUSED)
 {
 	struct sctp_stream_queue_pending *sp;
 	struct sctp_stream_out *strqt, *lock_on;
 	int subtract;
 
 	lock_on = NULL;
 	if (stcb->asoc.idata_supported == 0) {
 		sp = TAILQ_FIRST(&strq->outqueue);
 		if ((sp != NULL) && (sp->some_taken == 1)) {
 			lock_on = strq;
 		}
 	}
 	stcb->asoc.ss_data.locked_on_sending = lock_on;
 
 	subtract = strq->ss_params.fb.rounds;
 	TAILQ_FOREACH(strqt, &asoc->ss_data.out.wheel, ss_params.fb.next_spoke) {
 		strqt->ss_params.fb.rounds -= subtract;
 		if (strqt->ss_params.fb.rounds < 0) {
 			strqt->ss_params.fb.rounds = 0;
 		}
 	}
 
 	sp = TAILQ_FIRST(&strq->outqueue);
 	if (sp != NULL) {
 		strq->ss_params.fb.rounds = sp->length;
 	} else {
 		strq->ss_params.fb.rounds = -1;
 	}
 	asoc->ss_data.last_out_stream = strq;
 }
 
 /*
  * First-come, first-serve algorithm.
  * Maintains the order provided by the application.
  */
 /*
  * Forward declaration: sctp_ss_fcfs_init() below calls this function before
  * its definition appears.
  */
 static void
 sctp_ss_fcfs_add(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp,
     int holds_lock);
 
 /*
  * Initialise the FCFS list of 'asoc' and populate it from the existing
  * stream queues of stcb->asoc.  Messages are added in rounds: the first
  * message of every stream, then the second of every stream, and so on,
  * until a round finds no message at that depth in any stream.
  */
 static void
 sctp_ss_fcfs_init(struct sctp_tcb *stcb, struct sctp_association *asoc,
     int holds_lock)
 {
 	struct sctp_stream_queue_pending *sp;
 	uint32_t depth, skipped;
 	uint16_t sid;
 	int found;
 
 	TAILQ_INIT(&asoc->ss_data.out.list);
 	/*
 	 * Data already sitting in the stream queues means the scheduler of
 	 * an existing association was switched; the original submission
 	 * order is not known, so the queues are interleaved by depth.
 	 */
 	depth = 0;
 	do {
 		found = 0;
 		for (sid = 0; sid < stcb->asoc.streamoutcnt; sid++) {
 			/* Walk to the message at position 'depth', if any. */
 			sp = TAILQ_FIRST(&stcb->asoc.strmout[sid].outqueue);
 			for (skipped = 0; sp != NULL && skipped < depth; skipped++) {
 				sp = TAILQ_NEXT(sp, next);
 			}
 			if (sp != NULL) {
 				sctp_ss_fcfs_add(stcb, &stcb->asoc, &stcb->asoc.strmout[sid], sp, holds_lock);
 				found = 1;
 			}
 		}
 		depth++;
 	} while (found);
 }
 
 static void
 sctp_ss_fcfs_clear(struct sctp_tcb *stcb, struct sctp_association *asoc,
     int clear_values, int holds_lock)
 {
 	if (clear_values) {
 		if (holds_lock == 0) {
 			SCTP_TCB_SEND_LOCK(stcb);
 		}
 		while (!TAILQ_EMPTY(&asoc->ss_data.out.list)) {
 			TAILQ_REMOVE(&asoc->ss_data.out.list, TAILQ_FIRST(&asoc->ss_data.out.list), ss_next);
 		}
 		if (holds_lock == 0) {
 			SCTP_TCB_SEND_UNLOCK(stcb);
 		}
 	}
 	return;
 }
 
 /*
  * Per-stream initialisation for FCFS.  When with_strq is given, any
  * scheduler reference to with_strq (last stream served, stream locked on
  * sending) is redirected to strq; otherwise nothing is done.
  */
 static void
 sctp_ss_fcfs_init_stream(struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq)
 {
 	if (with_strq == NULL) {
 		return;
 	}
 	if (stcb->asoc.ss_data.last_out_stream == with_strq) {
 		stcb->asoc.ss_data.last_out_stream = strq;
 	}
 	if (stcb->asoc.ss_data.locked_on_sending == with_strq) {
 		stcb->asoc.ss_data.locked_on_sending = strq;
 	}
 }
 
 /*
  * Append a message to the tail of the FCFS list.  A NULL sp, or one whose
  * list links are not both NULL, is left alone.  The TCB send lock is taken
  * here unless holds_lock is non-zero.
  */
 static void
 sctp_ss_fcfs_add(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq SCTP_UNUSED, struct sctp_stream_queue_pending *sp,
     int holds_lock)
 {
 	const int need_lock = (holds_lock == 0);
 
 	if (need_lock) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	if (sp != NULL &&
 	    sp->ss_next.tqe_next == NULL &&
 	    sp->ss_next.tqe_prev == NULL) {
 		TAILQ_INSERT_TAIL(&asoc->ss_data.out.list, sp, ss_next);
 	}
 	if (need_lock) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 }
 
 /* Return 1 when the FCFS list holds no pending message, 0 otherwise. */
 static int
 sctp_ss_fcfs_is_empty(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_association *asoc)
 {
 	return (TAILQ_EMPTY(&asoc->ss_data.out.list) ? 1 : 0);
 }
 
 /*
  * Take a message off the FCFS list.
  *
  * sp is unlinked only when at least one of its list links is non-NULL,
  * which is how this scheduler recognises an entry that is on the list.
  * The TCB send lock is taken here unless holds_lock is non-zero.
  */
 static void
 sctp_ss_fcfs_remove(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_stream_out *strq SCTP_UNUSED, struct sctp_stream_queue_pending *sp,
     int holds_lock)
 {
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	if (sp &&
 	    ((sp->ss_next.tqe_next != NULL) ||
 	    (sp->ss_next.tqe_prev != NULL))) {
 		TAILQ_REMOVE(&asoc->ss_data.out.list, sp, ss_next);
 		/*
 		 * TAILQ_REMOVE() leaves the entry's own link fields
 		 * untouched.  Reset them so that the membership test above
 		 * (and the matching one used when adding) sees the entry as
 		 * unlinked; without this a second removal would corrupt the
 		 * list and the entry could not be queued again.
 		 */
 		sp->ss_next.tqe_next = NULL;
 		sp->ss_next.tqe_prev = NULL;
 	}
 	if (holds_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 	return;
 }
 
 
 /*
  * Pick the stream to send from next: the stream owning the oldest message
  * on the FCFS list that is acceptable for the requested destination.
  * Returns NULL when the list is exhausted without a match.
  */
 static struct sctp_stream_out *
 sctp_ss_fcfs_select(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net,
     struct sctp_association *asoc)
 {
 	struct sctp_stream_queue_pending *sp, *head;
 	struct sctp_stream_out *strq;
 
 	for (sp = TAILQ_FIRST(&asoc->ss_data.out.list); sp != NULL;
 	    sp = TAILQ_NEXT(sp, ss_next)) {
 		strq = &asoc->strmout[sp->sid];
 
 		/*
 		 * With CMT on, or without a requested destination, the
 		 * destination check is skipped and the first candidate wins
 		 * (in CMT's case one data packet is sent towards the
 		 * requested net).
 		 */
 		if (net == NULL || SCTP_BASE_SYSCTL(sctp_cmt_on_off) != 0) {
 			return (strq);
 		}
 
 		/*
 		 * CMT is off: the first item queued on the stream must be
 		 * unbound or bound to the requested network destination;
 		 * otherwise move on to the next entry of the FCFS list.
 		 */
 		head = TAILQ_FIRST(&strq->outqueue);
 		if (head == NULL || head->net == NULL || head->net == net) {
 			return (strq);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Table of stream-scheduler callback sets, one entry per scheduling policy.
  * Entries are positional; the comment in front of each entry names the
  * policy it implements (presumably the matching SCTP_SS_* constant is used
  * as the array index -- confirm against the callers).  Policies that need
  * no special handling for an operation reuse the sctp_ss_default_* routine.
  */
 const struct sctp_ss_functions sctp_ss_functions[] = {
 /* SCTP_SS_DEFAULT */
 	{
 		.sctp_ss_init = sctp_ss_default_init,
 		.sctp_ss_clear = sctp_ss_default_clear,
 		.sctp_ss_init_stream = sctp_ss_default_init_stream,
 		.sctp_ss_add_to_stream = sctp_ss_default_add,
 		.sctp_ss_is_empty = sctp_ss_default_is_empty,
 		.sctp_ss_remove_from_stream = sctp_ss_default_remove,
 		.sctp_ss_select_stream = sctp_ss_default_select,
 		.sctp_ss_scheduled = sctp_ss_default_scheduled,
 		.sctp_ss_packet_done = sctp_ss_default_packet_done,
 		.sctp_ss_get_value = sctp_ss_default_get_value,
 		.sctp_ss_set_value = sctp_ss_default_set_value,
 		.sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete
 	},
 /* SCTP_SS_ROUND_ROBIN */
 	{
 		.sctp_ss_init = sctp_ss_default_init,
 		.sctp_ss_clear = sctp_ss_default_clear,
 		.sctp_ss_init_stream = sctp_ss_default_init_stream,
 		.sctp_ss_add_to_stream = sctp_ss_rr_add,
 		.sctp_ss_is_empty = sctp_ss_default_is_empty,
 		.sctp_ss_remove_from_stream = sctp_ss_default_remove,
 		.sctp_ss_select_stream = sctp_ss_default_select,
 		.sctp_ss_scheduled = sctp_ss_default_scheduled,
 		.sctp_ss_packet_done = sctp_ss_default_packet_done,
 		.sctp_ss_get_value = sctp_ss_default_get_value,
 		.sctp_ss_set_value = sctp_ss_default_set_value,
 		.sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete
 	},
 /* SCTP_SS_ROUND_ROBIN_PACKET */
 	{
 		.sctp_ss_init = sctp_ss_default_init,
 		.sctp_ss_clear = sctp_ss_default_clear,
 		.sctp_ss_init_stream = sctp_ss_default_init_stream,
 		.sctp_ss_add_to_stream = sctp_ss_rr_add,
 		.sctp_ss_is_empty = sctp_ss_default_is_empty,
 		.sctp_ss_remove_from_stream = sctp_ss_default_remove,
 		.sctp_ss_select_stream = sctp_ss_rrp_select,
 		.sctp_ss_scheduled = sctp_ss_default_scheduled,
 		.sctp_ss_packet_done = sctp_ss_rrp_packet_done,
 		.sctp_ss_get_value = sctp_ss_default_get_value,
 		.sctp_ss_set_value = sctp_ss_default_set_value,
 		.sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete
 	},
 /* SCTP_SS_PRIORITY */
 	{
 		.sctp_ss_init = sctp_ss_default_init,
 		.sctp_ss_clear = sctp_ss_prio_clear,
 		.sctp_ss_init_stream = sctp_ss_prio_init_stream,
 		.sctp_ss_add_to_stream = sctp_ss_prio_add,
 		.sctp_ss_is_empty = sctp_ss_default_is_empty,
 		.sctp_ss_remove_from_stream = sctp_ss_prio_remove,
 		.sctp_ss_select_stream = sctp_ss_prio_select,
 		.sctp_ss_scheduled = sctp_ss_default_scheduled,
 		.sctp_ss_packet_done = sctp_ss_default_packet_done,
 		.sctp_ss_get_value = sctp_ss_prio_get_value,
 		.sctp_ss_set_value = sctp_ss_prio_set_value,
 		.sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete
 	},
 /* SCTP_SS_FAIR_BANDWITH */
 	{
 		.sctp_ss_init = sctp_ss_default_init,
 		.sctp_ss_clear = sctp_ss_fb_clear,
 		.sctp_ss_init_stream = sctp_ss_fb_init_stream,
 		.sctp_ss_add_to_stream = sctp_ss_fb_add,
 		.sctp_ss_is_empty = sctp_ss_default_is_empty,
 		.sctp_ss_remove_from_stream = sctp_ss_fb_remove,
 		.sctp_ss_select_stream = sctp_ss_fb_select,
 		.sctp_ss_scheduled = sctp_ss_fb_scheduled,
 		.sctp_ss_packet_done = sctp_ss_default_packet_done,
 		.sctp_ss_get_value = sctp_ss_default_get_value,
 		.sctp_ss_set_value = sctp_ss_default_set_value,
 		.sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete
 	},
 /* SCTP_SS_FIRST_COME */
 	{
 		.sctp_ss_init = sctp_ss_fcfs_init,
 		.sctp_ss_clear = sctp_ss_fcfs_clear,
 		.sctp_ss_init_stream = sctp_ss_fcfs_init_stream,
 		.sctp_ss_add_to_stream = sctp_ss_fcfs_add,
 		.sctp_ss_is_empty = sctp_ss_fcfs_is_empty,
 		.sctp_ss_remove_from_stream = sctp_ss_fcfs_remove,
 		.sctp_ss_select_stream = sctp_ss_fcfs_select,
 		.sctp_ss_scheduled = sctp_ss_default_scheduled,
 		.sctp_ss_packet_done = sctp_ss_default_packet_done,
 		.sctp_ss_get_value = sctp_ss_default_get_value,
 		.sctp_ss_set_value = sctp_ss_default_set_value,
 		.sctp_ss_is_user_msgs_incomplete = sctp_ss_default_is_user_msgs_incomplete
 	}
 };
Index: projects/bsd_rdma_4_9/sys/powerpc/powerpc/exec_machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/powerpc/powerpc/exec_machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/powerpc/powerpc/exec_machdep.c	(revision 326162)
@@ -1,1085 +1,1072 @@
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (C) 2001 Benno Rice
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *	$NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_fpu_emu.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/uio.h>
 
 #include <machine/altivec.h>
 #include <machine/cpu.h>
 #include <machine/elf.h>
 #include <machine/fpu.h>
 #include <machine/pcb.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/trap.h>
 #include <machine/vmparam.h>
 
 #ifdef FPU_EMU
 #include <powerpc/fpu/fpu_extern.h>
 #endif
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 
 /*
  * User context layout used for COMPAT_FREEBSD32 processes: pointers and
  * flags are held in 32-bit fields and the machine context is a mcontext32_t.
  */
 typedef struct __ucontext32 {
 	sigset_t		uc_sigmask;
 	mcontext32_t		uc_mcontext;
 	uint32_t		uc_link;
 	struct sigaltstack32    uc_stack;
 	uint32_t		uc_flags;
 	uint32_t		__spare__[4];
 } ucontext32_t;
 
 /*
  * Signal frame copied to the user stack by sendsig() for ILP32 processes:
  * the saved user context followed by the 32-bit siginfo.
  */
 struct sigframe32 {
 	ucontext32_t		sf_uc;
 	struct siginfo32	sf_si;
 };
 
 static int	grab_mcontext32(struct thread *td, mcontext32_t *, int flags);
 #endif
 
 static int	grab_mcontext(struct thread *, mcontext_t *, int);
 
 /*
  * Deliver a signal to the current thread.
  *
  * Builds a signal frame (native, or 32-bit for ILP32 processes when
  * COMPAT_FREEBSD32 is configured) holding the interrupted user context,
  * rewrites the trapframe so that the thread resumes in the signal
  * trampoline with the handler's arguments in place, and copies the frame
  * to the user stack.
  *
  *   catcher - address of the user signal handler
  *   ksi     - signal information; si_signo and si_addr are filled in here
  *   mask    - signal mask stored in the saved context
  *
  * Entered with the process lock and the sigacts mutex held (both are
  * asserted).  They are dropped around the copyout and re-acquired before
  * returning.  If the copyout fails the process is killed with SIGILL.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct trapframe *tf;
 	struct sigacts *psp;
 	struct sigframe sf;
 	struct thread *td;
 	struct proc *p;
 	#ifdef COMPAT_FREEBSD32
 	struct siginfo32 siginfo32;
 	struct sigframe32 sf32;
 	#endif
 	size_t sfpsize;
 	caddr_t sfp, usfp;
 	int oonstack, rndfsize;
 	int sig;
 	int code;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	oonstack = sigonstack(tf->fixreg[1]);
 
 	/*
 	 * Fill siginfo structure.
 	 */
 	ksi->ksi_info.si_signo = ksi->ksi_signo;
 	ksi->ksi_info.si_addr = (void *)((tf->exc == EXC_DSI) ?
 	    tf->dar : tf->srr0);
 
 	#ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(p, SV_ILP32)) {
 		siginfo_to_siginfo32(&ksi->ksi_info, &siginfo32);
 		sig = siginfo32.si_signo;
 		code = siginfo32.si_code;
 		sfp = (caddr_t)&sf32;
 		sfpsize = sizeof(sf32);
 		rndfsize = roundup(sizeof(sf32), 16);
 
 		/*
 		 * Save user context
 		 */
 
 		memset(&sf32, 0, sizeof(sf32));
 		grab_mcontext32(td, &sf32.sf_uc.uc_mcontext, 0);
 
 		sf32.sf_uc.uc_sigmask = *mask;
 		sf32.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
 		sf32.sf_uc.uc_stack.ss_size = (uint32_t)td->td_sigstk.ss_size;
 		sf32.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 		    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 		sf32.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	} else {
 	#endif
 		sig = ksi->ksi_signo;
 		code = ksi->ksi_code;
 		sfp = (caddr_t)&sf;
 		sfpsize = sizeof(sf);
 		#ifdef __powerpc64__
 		/*
 		 * 64-bit PPC defines a 288 byte scratch region
 		 * below the stack.
 		 */
 		rndfsize = 288 + roundup(sizeof(sf), 48);
 		#else
 		rndfsize = roundup(sizeof(sf), 16);
 		#endif
 
 		/*
 		 * Save user context
 		 */
 
 		memset(&sf, 0, sizeof(sf));
 		grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 
 		sf.sf_uc.uc_sigmask = *mask;
 		sf.sf_uc.uc_stack = td->td_sigstk;
 		sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 		    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 		sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	#ifdef COMPAT_FREEBSD32
 	}
 	#endif
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	     catcher, sig);
 
 	/*
 	 * Allocate and validate space for the signal handler context.
 	 * The frame goes at the top of the alternate signal stack when one
 	 * is configured, not already in use, and requested for this signal;
 	 * otherwise below the current user stack pointer.  Either way the
 	 * address is aligned down to 16 bytes.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		usfp = (void *)(((uintptr_t)td->td_sigstk.ss_sp +
 		   td->td_sigstk.ss_size - rndfsize) & ~0xFul);
 	} else {
 		usfp = (void *)((tf->fixreg[1] - rndfsize) & ~0xFul);
 	}
 
 	/*
 	 * Save the floating-point state, if necessary, then copy it.
 	 */
 	/* XXX */
 
 	/*
 	 * Set up the registers to return to sigcode.
 	 *
 	 *   r1/sp - sigframe ptr
 	 *   lr    - sig function, dispatched to by blrl in trampoline
 	 *   r3    - sig number
 	 *   r4    - SIGINFO ? &siginfo : exception code
 	 *   r5    - user context
 	 *   srr0  - trampoline function addr
 	 */
 	tf->lr = (register_t)catcher;
 	tf->fixreg[1] = (register_t)usfp;
 	tf->fixreg[FIRSTARG] = sig;
 	#ifdef COMPAT_FREEBSD32
 	tf->fixreg[FIRSTARG+2] = (register_t)usfp +
 	    ((SV_PROC_FLAG(p, SV_ILP32)) ?
 	    offsetof(struct sigframe32, sf_uc) :
 	    offsetof(struct sigframe, sf_uc));
 	#else
 	tf->fixreg[FIRSTARG+2] = (register_t)usfp +
 	    offsetof(struct sigframe, sf_uc);
 	#endif
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/*
 		 * Signal handler installed with SA_SIGINFO.
 		 */
 		#ifdef COMPAT_FREEBSD32
 		if (SV_PROC_FLAG(p, SV_ILP32)) {
 			/* One copy of the converted siginfo is enough. */
 			sf32.sf_si = siginfo32;
 			tf->fixreg[FIRSTARG+1] = (register_t)usfp +
 			    offsetof(struct sigframe32, sf_si);
 		} else  {
 		#endif
 			tf->fixreg[FIRSTARG+1] = (register_t)usfp +
 			    offsetof(struct sigframe, sf_si);
 			sf.sf_si = ksi->ksi_info;
 		#ifdef COMPAT_FREEBSD32
 		}
 		#endif
 	} else {
 		/* Old FreeBSD-style arguments. */
 		tf->fixreg[FIRSTARG+1] = code;
 		tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ?
 		    tf->dar : tf->srr0;
 	}
 	/* Drop both locks before touching user memory. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	tf->srr0 = (register_t)p->p_sysent->sv_sigcode_base;
 
 	/*
 	 * copy the frame out to userland.
 	 */
 	if (copyout(sfp, usfp, sfpsize) != 0) {
 		/*
 		 * Process has trashed its stack. Kill it.
 		 */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td,
 	     tf->srr0, tf->fixreg[1]);
 
 	/* Re-acquire the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * sigreturn(2): restore the machine context and signal mask saved in the
  * user context at uap->sigcntxp.  Returns EFAULT when the context cannot be
  * read, the error from set_mcontext() when it is rejected, and EJUSTRETURN
  * on success.
  */
 int
 sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	ucontext_t ctx;
 	int rv;
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 
 	if (copyin(uap->sigcntxp, &ctx, sizeof(ctx)) != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	rv = set_mcontext(td, &ctx.uc_mcontext);
 	if (rv == 0) {
 		/* Machine state accepted; now restore the signal mask. */
 		kern_sigprocmask(td, SIG_SETMASK, &ctx.uc_sigmask, NULL, 0);
 
 		CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
 		     td, ctx.uc_mcontext.mc_srr0, ctx.uc_mcontext.mc_gpr[1]);
 
 		rv = EJUSTRETURN;
 	}
 	return (rv);
 }
 
 #ifdef COMPAT_FREEBSD4
 /* FreeBSD 4 compatibility entry point; forwards to sys_sigreturn(). */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args = (struct sigreturn_args *)uap;
 
 	return (sys_sigreturn(td, args));
 }
 #endif
 
 /*
  * Construct a PCB from a trapframe. This is called from kdb_trap() where
  * we want to start a backtrace from the function that caused us to enter
  * the debugger. We have the context in the trapframe, but base the trace
  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
  * enough for a backtrace.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 	/* Stack pointer is r1; the "return address" is the trapped pc. */
 	pcb->pcb_sp = tf->fixreg[1];
 	pcb->pcb_lr = tf->srr0;
 }
 
 /*
  * get_mcontext/sendsig helper routine that doesn't touch the
  * proc lock
  */
 /*
  * Capture the machine context of a thread into *mcp.
  *
  *   td    - thread whose state is captured (must be curthread whenever live
  *           FPU or Altivec state has to be saved; this is asserted)
  *   mcp   - destination; fully zeroed before being filled
  *   flags - GET_MC_CLEAR_RET clears the saved r3 and r4
  *
  * Always returns 0.
  */
 static int
 grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *pcb;
 	int i;
 
 	pcb = td->td_pcb;
 
 	memset(mcp, 0, sizeof(mcontext_t));
 
 	mcp->mc_vers = _MC_VERSION;
 	mcp->mc_flags = 0;
 	memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe));
 	if (flags & GET_MC_CLEAR_RET) {
 		mcp->mc_gpr[3] = 0;
 		mcp->mc_gpr[4] = 0;
 	}
 
 	/*
 	 * This assumes that floating-point context is *not* lazy,
 	 * so if the thread has used FP there would have been a
 	 * FP-unavailable exception that would have set things up
 	 * correctly.
 	 */
 	if (pcb->pcb_flags & PCB_FPREGS) {
 		if (pcb->pcb_flags & PCB_FPU) {
 			KASSERT(td == curthread,
 				("get_mcontext: fp save not curthread"));
 			critical_enter();
 			save_fpu(td);
 			critical_exit();
 		}
 		mcp->mc_flags |= _MC_FP_VALID;
 		memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double));
 		for (i = 0; i < 32; i++)
 			memcpy(&mcp->mc_fpreg[i], &pcb->pcb_fpu.fpr[i].fpr,
 			    sizeof(double));
 	}
 
 	if (pcb->pcb_flags & PCB_VSX) {
 		for (i = 0; i < 32; i++)
 			memcpy(&mcp->mc_vsxfpreg[i],
 			    &pcb->pcb_fpu.fpr[i].vsr[2], sizeof(double));
 	}
 
 	/*
 	 * Repeat for Altivec context
 	 */
 
 	if (pcb->pcb_flags & PCB_VEC) {
 		/* Message names the vector save, not the FP one above. */
 		KASSERT(td == curthread,
 			("get_mcontext: vec save not curthread"));
 		critical_enter();
 		save_vec(td);
 		critical_exit();
 		mcp->mc_flags |= _MC_AV_VALID;
 		mcp->mc_vscr  = pcb->pcb_vec.vscr;
 		mcp->mc_vrsave =  pcb->pcb_vec.vrsave;
 		memcpy(mcp->mc_avec, pcb->pcb_vec.vr, sizeof(mcp->mc_avec));
 	}
 
 	mcp->mc_len = sizeof(*mcp);
 
 	return (0);
 }
 
 /*
  * Capture the machine context of td into *mcp and record, under the
  * process lock, whether its stack pointer lies on the signal stack.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	int rv;
 
 	rv = grab_mcontext(td, mcp, flags);
 	if (rv != 0)
 		return (rv);
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
 	PROC_UNLOCK(curthread->td_proc);
 
 	return (rv);
 }
 
 /*
  * Install a machine context into a thread.
  *
  * Returns EINVAL when the context's version or length does not match, or
  * when it would change any of the PSL_USERSTATIC bits of the MSR;
  * otherwise the trapframe, and optionally the FP and Altivec state, are
  * replaced and 0 is returned.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct trapframe *frame = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 	register_t saved_tls;
 	int tls_reg, n;
 
 	if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 
 	/* The user must not be able to alter privileged MSR bits. */
 	if ((mcp->mc_srr1 & PSL_USERSTATIC) !=
 	    (frame->srr1 & PSL_USERSTATIC))
 		return (EINVAL);
 
 	/*
 	 * Replace the trapframe but carry the TLS pointer over: it lives in
 	 * r13 for LP64 processes and in r2 otherwise.
 	 */
 	tls_reg = SV_PROC_FLAG(td->td_proc, SV_LP64) ? 13 : 2;
 	saved_tls = frame->fixreg[tls_reg];
 	memcpy(frame, mcp->mc_frame, sizeof(mcp->mc_frame));
 	frame->fixreg[tls_reg] = saved_tls;
 
 	if (mcp->mc_flags & _MC_FP_VALID) {
 		/* enable_fpu() will happen lazily on a fault */
 		pcb->pcb_flags |= PCB_FPREGS;
 		memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double));
 		bzero(pcb->pcb_fpu.fpr, sizeof(pcb->pcb_fpu.fpr));
 		for (n = 0; n < 32; n++) {
 			memcpy(&pcb->pcb_fpu.fpr[n].fpr, &mcp->mc_fpreg[n],
 			    sizeof(double));
 			memcpy(&pcb->pcb_fpu.fpr[n].vsr[2],
 			    &mcp->mc_vsxfpreg[n], sizeof(double));
 		}
 	}
 
 	if (mcp->mc_flags & _MC_AV_VALID) {
 		/* Turn the vector unit on first if the thread has none yet. */
 		if ((pcb->pcb_flags & PCB_VEC) != PCB_VEC) {
 			critical_enter();
 			enable_vec(td);
 			critical_exit();
 		}
 		pcb->pcb_vec.vrsave = mcp->mc_vrsave;
 		pcb->pcb_vec.vscr = mcp->mc_vscr;
 		memcpy(pcb->pcb_vec.vr, mcp->mc_avec, sizeof(mcp->mc_avec));
 	}
 
 	return (0);
 }
 
 /*
  * Set up registers on exec.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe	*tf;
 	register_t		argc;
 
 	tf = trapframe(td);
 	bzero(tf, sizeof *tf);
 	/*
 	 * Initial stack pointer: (stack - 48) on powerpc64, (stack - 8)
 	 * otherwise, rounded down to a 16-byte boundary.
 	 */
 	#ifdef __powerpc64__
 	tf->fixreg[1] = -roundup(-stack + 48, 16);
 	#else
 	tf->fixreg[1] = -roundup(-stack + 8, 16);
 	#endif
 
 	/*
 	 * Set up arguments for _start():
 	 *	_start(argc, argv, envp, obj, cleanup, ps_strings);
 	 *
 	 * Notes:
 	 *	- obj and cleanup are the auxiliary and termination
 	 *	  vectors.  They are fixed up by ld.elf_so.
 	 *	- ps_strings is a NetBSD extension, and will be
 	 * 	  ignored by executables which are strictly
 	 *	  compliant with the SVR4 ABI.
-	 *
-	 * XXX We have to set both regs and retval here due to different
-	 * XXX calling convention in trap.c and init_main.c.
 	 */
 
 	/* Collect argc from the user stack */
 	argc = fuword((void *)stack);
 
-        /*
-         * XXX PG: these get overwritten in the syscall return code.
-         * execve() should return EJUSTRETURN, like it does on NetBSD.
-         * Emulate by setting the syscall return value cells. The
-         * registers still have to be set for init's fork trampoline.
-         */
-        td->td_retval[0] = argc;
-        td->td_retval[1] = stack + sizeof(register_t);
 	/* r3 = argc, r4 = argv, r5 = envp (argv plus argc entries and a terminator). */
 	tf->fixreg[3] = argc;
 	tf->fixreg[4] = stack + sizeof(register_t);
 	tf->fixreg[5] = stack + (2 + argc)*sizeof(register_t);
 	tf->fixreg[6] = 0;				/* auxiliary vector */
 	tf->fixreg[7] = 0;				/* termination vector */
 	tf->fixreg[8] = (register_t)imgp->ps_strings;	/* NetBSD extension */
 
 	/* Resume at the image entry point with the user-mode MSR. */
 	tf->srr0 = imgp->entry_addr;
 	#ifdef __powerpc64__
 	tf->fixreg[12] = imgp->entry_addr;
 	#ifdef AIM
 	tf->srr1 = PSL_SF | PSL_USERSET | PSL_FE_DFLT;
 	if (mfmsr() & PSL_HV)
 		tf->srr1 |= PSL_HV;
 	#elif defined(BOOKE)
 	tf->srr1 = PSL_CM | PSL_USERSET | PSL_FE_DFLT;
 	#endif
 	#else
 	tf->srr1 = PSL_USERSET | PSL_FE_DFLT;
 	#endif
 	/* The new image starts with no PCB flags set. */
 	td->td_pcb->pcb_flags = 0;
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * 32-bit counterpart of exec_setregs(): set up the trapframe for a freshly
  * exec'd image whose stack holds 32-bit words.
  */
 void
 ppc32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe	*tf;
 	uint32_t		argc;
 
 	tf = trapframe(td);
 	bzero(tf, sizeof *tf);
 	/* Initial stack pointer: (stack - 8) rounded down to 16 bytes. */
 	tf->fixreg[1] = -roundup(-stack + 8, 16);
 
 	/* argc is the first 32-bit word on the user stack. */
 	argc = fuword32((void *)stack);
 
-        td->td_retval[0] = argc;
-        td->td_retval[1] = stack + sizeof(uint32_t);
 	tf->fixreg[3] = argc;
 	tf->fixreg[4] = stack + sizeof(uint32_t);
 	tf->fixreg[5] = stack + (2 + argc)*sizeof(uint32_t);
 	tf->fixreg[6] = 0;				/* auxiliary vector */
 	tf->fixreg[7] = 0;				/* termination vector */
 	tf->fixreg[8] = (register_t)imgp->ps_strings;	/* NetBSD extension */
 
 	tf->srr0 = imgp->entry_addr;
 	tf->srr1 = PSL_USERSET | PSL_FE_DFLT;
 	/* Make sure the 64-bit mode bit for this CPU family is clear. */
 #ifdef AIM
 	tf->srr1 &= ~PSL_SF;
 	if (mfmsr() & PSL_HV)
 		tf->srr1 |= PSL_HV;
 #elif defined(BOOKE)
 	tf->srr1 &= ~PSL_CM;
 #endif
 	td->td_pcb->pcb_flags = 0;
 }
 #endif
 
 /*
  * Copy the general register state of td into *regs.  The leading
  * sizeof(struct reg) bytes of the trapframe are copied verbatim.
  * Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	memcpy(regs, td->td_frame, sizeof(struct reg));
 	return (0);
 }
 
 /* Debug-register read hook; always fails with ENOSYS. */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	/* No debug registers on PowerPC */
 	return (ENOSYS);
 }
 
 /*
  * Copy the floating-point state of td into *fpregs.  A thread whose PCB
  * does not have PCB_FPREGS set yields an all-zero register set.
  * Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *pcb = td->td_pcb;
 	int n;
 
 	if (pcb->pcb_flags & PCB_FPREGS) {
 		memcpy(&fpregs->fpscr, &pcb->pcb_fpu.fpscr, sizeof(double));
 		for (n = 0; n < 32; n++) {
 			memcpy(&fpregs->fpreg[n], &pcb->pcb_fpu.fpr[n].fpr,
 			    sizeof(double));
 		}
 	} else {
 		memset(fpregs, 0, sizeof(struct fpreg));
 	}
 
 	return (0);
 }
 
 /*
  * Load the general register state of td from *regs by overwriting the
  * leading sizeof(struct reg) bytes of its trapframe.  Always returns 0.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	memcpy(td->td_frame, regs, sizeof(struct reg));
 	return (0);
 }
 
 /* Debug-register write hook; always fails with ENOSYS. */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	/* No debug registers on PowerPC */
 	return (ENOSYS);
 }
 
 /*
  * Load the floating-point state of td from *fpregs and mark the PCB as
  * holding FP registers.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *pcb = td->td_pcb;
 	int n;
 
 	pcb->pcb_flags |= PCB_FPREGS;
 	memcpy(&pcb->pcb_fpu.fpscr, &fpregs->fpscr, sizeof(double));
 	for (n = 0; n < 32; n++)
 		memcpy(&pcb->pcb_fpu.fpr[n].fpr, &fpregs->fpreg[n],
 		    sizeof(double));
 
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Load the trapframe of td from a 32-bit register set: r0-r31, lr, cr,
  * xer, ctr and the program counter (srr0).  Always returns 0.
  */
 int
 set_regs32(struct thread *td, struct reg32 *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	int n;
 
 	for (n = 0; n < 32; n++) {
 		frame->fixreg[n] = regs->fixreg[n];
 	}
 	frame->srr0 = regs->pc;
 	frame->lr = regs->lr;
 	frame->ctr = regs->ctr;
 	frame->cr = regs->cr;
 	frame->xer = regs->xer;
 
 	return (0);
 }
 
 /*
  * Fill a 32-bit register set from the trapframe of td: r0-r31, lr, cr,
  * xer, ctr and the program counter (srr0).  Always returns 0.
  */
 int
 fill_regs32(struct thread *td, struct reg32 *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	int n;
 
 	for (n = 0; n < 32; n++) {
 		regs->fixreg[n] = frame->fixreg[n];
 	}
 	regs->pc = frame->srr0;
 	regs->lr = frame->lr;
 	regs->ctr = frame->ctr;
 	regs->cr = frame->cr;
 	regs->xer = frame->xer;
 
 	return (0);
 }
 
 /*
  * Capture the machine context of td in 32-bit form: take the native
  * context with grab_mcontext() and copy it field by field into *mcp.
  * Returns the error from grab_mcontext(), or 0.
  */
 static int
 grab_mcontext32(struct thread *td, mcontext32_t *mcp, int flags)
 {
 	mcontext_t native;
 	int rv, n;
 
 	rv = grab_mcontext(td, &native, flags);
 	if (rv != 0)
 		return (rv);
 
 	/* Header fields. */
 	mcp->mc_vers = native.mc_vers;
 	mcp->mc_flags = native.mc_flags;
 	mcp->mc_onstack = native.mc_onstack;
 	mcp->mc_len = native.mc_len;
 
 	/* Vector state is copied as raw bytes. */
 	memcpy(mcp->mc_avec, native.mc_avec, sizeof(native.mc_avec));
 	memcpy(mcp->mc_av, native.mc_av, sizeof(native.mc_av));
 
 	/* Frame words are assigned one at a time into the 32-bit layout. */
 	for (n = 0; n < 42; n++) {
 		mcp->mc_frame[n] = native.mc_frame[n];
 	}
 
 	/* Floating-point and VSX state, again as raw bytes. */
 	memcpy(mcp->mc_fpreg, native.mc_fpreg, sizeof(native.mc_fpreg));
 	memcpy(mcp->mc_vsxfpreg, native.mc_vsxfpreg,
 	    sizeof(native.mc_vsxfpreg));
 
 	return (0);
 }
 
 /*
  * 32-bit variant of get_mcontext(): capture the context with
  * grab_mcontext32() and record, under the process lock, whether the stack
  * pointer lies on the signal stack.
  */
 static int
 get_mcontext32(struct thread *td, mcontext32_t *mcp, int flags)
 {
 	int rv;
 
 	rv = grab_mcontext32(td, mcp, flags);
 	if (rv != 0)
 		return (rv);
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
 	PROC_UNLOCK(curthread->td_proc);
 
 	return (rv);
 }
 
 /*
  * Install a 32-bit machine context: widen *mcp into a native mcontext_t and
  * hand it to set_mcontext(), whose result is returned.
  */
 static int
 set_mcontext32(struct thread *td, mcontext32_t *mcp)
 {
 	mcontext_t native;
 	int n;
 
 	native.mc_vers = mcp->mc_vers;
 	native.mc_flags = mcp->mc_flags;
 	native.mc_onstack = mcp->mc_onstack;
 	native.mc_len = mcp->mc_len;
 	memcpy(native.mc_avec, mcp->mc_avec, sizeof(native.mc_avec));
 	memcpy(native.mc_av, mcp->mc_av, sizeof(native.mc_av));
 	for (n = 0; n < 42; n++) {
 		native.mc_frame[n] = mcp->mc_frame[n];
 	}
 	/* Keep the upper 32 bits of the thread's current MSR image. */
 	native.mc_srr1 |= (td->td_frame->srr1 & 0xFFFFFFFF00000000ULL);
 	memcpy(native.mc_fpreg, mcp->mc_fpreg, sizeof(native.mc_fpreg));
 	memcpy(native.mc_vsxfpreg, mcp->mc_vsxfpreg,
 	    sizeof(native.mc_vsxfpreg));
 
 	return (set_mcontext(td, &native));
 }
 #endif
 
 #ifdef COMPAT_FREEBSD32
 /*
  * 32-bit sigreturn: restore the machine context and signal mask from the
  * 32-bit user context at uap->sigcntxp.  Returns EFAULT when the context
  * cannot be read, the error from set_mcontext32() when it is rejected, and
  * EJUSTRETURN on success.
  */
 int
 freebsd32_sigreturn(struct thread *td, struct freebsd32_sigreturn_args *uap)
 {
 	ucontext32_t ctx;
 	int rv;
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 
 	if (copyin(uap->sigcntxp, &ctx, sizeof(ctx)) != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	rv = set_mcontext32(td, &ctx.uc_mcontext);
 	if (rv == 0) {
 		/* Machine state accepted; now restore the signal mask. */
 		kern_sigprocmask(td, SIG_SETMASK, &ctx.uc_sigmask, NULL, 0);
 
 		CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
 		     td, ctx.uc_mcontext.mc_srr0, ctx.uc_mcontext.mc_gpr[1]);
 
 		rv = EJUSTRETURN;
 	}
 	return (rv);
 }
 
 /*
  * The first two fields of a ucontext_t are the signal mask and the machine
  * context.  The next field is uc_link; we want to avoid destroying the link
  * when copying out contexts.
  */
 #define	UC32_COPY_SIZE	offsetof(ucontext32_t, uc_link)
 
 /*
  * 32-bit getcontext: store the caller's machine context and signal mask in
  * the user context at uap->ucp.  Only the part of the structure in front
  * of uc_link is written.  Returns EINVAL for a NULL ucp, otherwise the
  * result of the copyout.
  */
 int
 freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
 {
 	ucontext32_t uc;
 	int ret;
 
 	if (uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		/*
 		 * uc lives on the kernel stack and is copied out to user
 		 * space.  Zero it first so that padding, or any member that
 		 * get_mcontext32() does not assign, cannot disclose stale
 		 * kernel stack contents.
 		 */
 		bzero(&uc, sizeof(uc));
 		get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->ucp, UC32_COPY_SIZE);
 	}
 	return (ret);
 }
 
 /*
  * 32-bit setcontext: load the machine context and signal mask from the
  * user context at uap->ucp.  Returns EINVAL for a NULL ucp, the error from
  * copyin() or set_mcontext32() on failure, and EJUSTRETURN on success.
  */
 int
 freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
 {
 	ucontext32_t ctx;
 	int rv;
 
 	if (uap->ucp == NULL)
 		return (EINVAL);
 
 	rv = copyin(uap->ucp, &ctx, UC32_COPY_SIZE);
 	if (rv != 0)
 		return (rv);
 
 	rv = set_mcontext32(td, &ctx.uc_mcontext);
 	if (rv != 0)
 		return (rv);
 
 	kern_sigprocmask(td, SIG_SETMASK, &ctx.uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 /*
  * 32-bit swapcontext: save the caller's context to uap->oucp, then load the
  * context at uap->ucp.  Returns EINVAL when either pointer is NULL, the
  * first error encountered while saving or loading, and EJUSTRETURN on
  * success.
  */
 int
 freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
 {
 	ucontext32_t uc;
 	int ret;
 
 	if (uap->oucp == NULL || uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		/*
 		 * uc lives on the kernel stack and is copied out to user
 		 * space.  Zero it first so that padding, or any member that
 		 * get_mcontext32() does not assign, cannot disclose stale
 		 * kernel stack contents.
 		 */
 		bzero(&uc, sizeof(uc));
 		get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->oucp, UC32_COPY_SIZE);
 		if (ret == 0) {
 			/* Old context saved; now switch to the new one. */
 			ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE);
 			if (ret == 0) {
 				ret = set_mcontext32(td, &uc.uc_mcontext);
 				if (ret == 0) {
 					kern_sigprocmask(td, SIG_SETMASK,
 					    &uc.uc_sigmask, NULL, 0);
 				}
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
 
 #endif
 
 /*
  * Store the outcome of a system call in the thread's trapframe.
  *
  *   error == EJUSTRETURN : the trapframe is left untouched
  *   error == 0           : return values go to the first two argument
  *                          registers and the summary-overflow bit of CR
  *                          is cleared
  *   error == ERESTART    : srr0 is moved back by one instruction (4 bytes)
  *                          so the system call is re-executed
  *   anything else        : the ABI errno goes to the first argument
  *                          register and the summary-overflow bit is set
  */
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 	struct proc *p;
 	struct trapframe *tf;
 	int fixup;
 
 	if (error == EJUSTRETURN)
 		return;
 
 	p = td->td_proc;
 	tf = td->td_frame;
 
 	/*
 	 * fixup is set when an ILP32 process made the call through
 	 * __syscall and the real system call number (taken from the second
 	 * argument register, masked with sv_mask when that is set) is not
 	 * lseek (nor freebsd6_lseek where that exists).
 	 */
 	if (tf->fixreg[0] == SYS___syscall &&
 	    (SV_PROC_FLAG(p, SV_ILP32))) {
 		int code = tf->fixreg[FIRSTARG + 1];
 		if (p->p_sysent->sv_mask)
 			code &= p->p_sysent->sv_mask;
 		fixup = (
 #if defined(COMPAT_FREEBSD6) && defined(SYS_freebsd6_lseek)
 		    code != SYS_freebsd6_lseek &&
 #endif
 		    code != SYS_lseek) ?  1 : 0;
 	} else
 		fixup = 0;
 
 	switch (error) {
 	case 0:
 		if (fixup) {
 			/*
 			 * 64-bit return, 32-bit syscall. Fixup byte order
 			 */
 			tf->fixreg[FIRSTARG] = 0;
 			tf->fixreg[FIRSTARG + 1] = td->td_retval[0];
 		} else {
 			tf->fixreg[FIRSTARG] = td->td_retval[0];
 			tf->fixreg[FIRSTARG + 1] = td->td_retval[1];
 		}
 		tf->cr &= ~0x10000000;		/* Unset summary overflow */
 		break;
 	case ERESTART:
 		/*
 		 * Set user's pc back to redo the system call.
 		 */
 		tf->srr0 -= 4;
 		break;
 	default:
 		tf->fixreg[FIRSTARG] = SV_ABI_ERRNO(p, error);
 		tf->cr |= 0x10000000;		/* Set summary overflow */
 		break;
 	}
 }
 
 /*
  * Threading functions
  */
 /* Thread-exit hook; the body is empty, no work is done here. */
 void
 cpu_thread_exit(struct thread *td)
 {
 }
 
 /* Thread-clean hook; the body is empty, no work is done here. */
 void
 cpu_thread_clean(struct thread *td)
 {
 }
 
 /*
  * Place the PCB and trapframe of a new thread on its kernel stack: the PCB
  * sits at the (masked) top of the stack and the trapframe immediately
  * below it.
  */
 void
 cpu_thread_alloc(struct thread *td)
 {
 	struct pcb *pcb;
 
 	/*
 	 * NOTE(review): the mask 0x2f clears bits 0-3 and bit 5 but leaves
 	 * bit 4 alone, so the result is 16-byte aligned rather than aligned
 	 * to a larger power of two -- confirm whether 0x3f was intended.
 	 */
 	pcb = (struct pcb *)((td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
 	    sizeof(struct pcb)) & ~0x2fUL);
 	td->td_pcb = pcb;
 	/* One struct trapframe directly below the PCB. */
 	td->td_frame = (struct trapframe *)pcb - 1;
 }
 
 /* Thread-free hook; the body is empty, no work is done here. */
 void
 cpu_thread_free(struct thread *td)
 {
 }
 
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 
 	if (SV_PROC_FLAG(td->td_proc, SV_LP64))
 		td->td_frame->fixreg[13] = (register_t)tls_base + 0x7010;
 	else
 		td->td_frame->fixreg[2] = (register_t)tls_base + 0x7008;
 	return (0);
 }
 
 /*
  * Initialise the machine-dependent state of a new thread td as a copy of
  * td0: the PCB and trapframe are duplicated, the copied frame is made to
  * look like a successful system call returning 0, and the PCB is set up so
  * that the thread first runs fork_trampoline, which is handed fork_return
  * with td and the trapframe as arguments.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 	struct pcb *pcb2;
 	struct trapframe *tf;
 	struct callframe *cf;
 
 	pcb2 = td->td_pcb;
 
 	/* Copy the upcall pcb */
 	bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
 
 	/* Create a stack for the new thread */
 	tf = td->td_frame;
 	bcopy(td0->td_frame, tf, sizeof(struct trapframe));
 	/* Zero both return-value registers and clear summary overflow. */
 	tf->fixreg[FIRSTARG] = 0;
 	tf->fixreg[FIRSTARG + 1] = 0;
 	tf->cr &= ~0x10000000;
 
 	/* Set registers for trampoline to user mode. */
 	cf = (struct callframe *)tf - 1;
 	memset(cf, 0, sizeof(struct callframe));
 	cf->cf_func = (register_t)fork_return;
 	cf->cf_arg0 = (register_t)td;
 	cf->cf_arg1 = (register_t)tf;
 
 	pcb2->pcb_sp = (register_t)cf;
 	/*
 	 * In the first configuration fork_trampoline is read as a pair of
 	 * words (code address, TOC) rather than used as a code address.
 	 */
 	#if defined(__powerpc64__) && (!defined(_CALL_ELF) || _CALL_ELF == 1)
 	pcb2->pcb_lr = ((register_t *)fork_trampoline)[0];
 	pcb2->pcb_toc = ((register_t *)fork_trampoline)[1];
 	#else
 	pcb2->pcb_lr = (register_t)fork_trampoline;
 	pcb2->pcb_context[0] = pcb2->pcb_lr;
 	#endif
 	pcb2->pcb_cpu.aim.usr_vsid = 0;
 
 	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_msr = PSL_KERNSET;
 }
 
 /*
  * Set up the trapframe of td so that it enters user mode at entry(arg) on
  * the given stack.
  *
  *   entry - user entry point; for non-ILP32 processes on powerpc64 this is
  *           the user address of a three-word descriptor whose words are
  *           loaded into srr0, r2 and r11 respectively
  *   arg   - value passed in r3
  *   stack - user stack; the initial stack pointer is taken from its top,
  *           less room for the frame pointer and saved LR, aligned down to
  *           32 bytes
  */
 void
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
     stack_t *stack)
 {
 	struct trapframe *tf;
 	uintptr_t sp;
 
 	tf = td->td_frame;
 	/* align stack and alloc space for frame ptr and saved LR */
 	#ifdef __powerpc64__
 	sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 48) &
 	    ~0x1f;
 	#else
 	sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 8) &
 	    ~0x1f;
 	#endif
 	bzero(tf, sizeof(struct trapframe));
 
 	tf->fixreg[1] = (register_t)sp;
 	tf->fixreg[3] = (register_t)arg;
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		tf->srr0 = (register_t)entry;
 		tf->srr1 = PSL_USERSET | PSL_FE_DFLT;
 		#ifdef __powerpc64__
 		tf->srr1 &= ~PSL_SF;
 		#endif
 	} else {
 	    #ifdef __powerpc64__
 		register_t entry_desc[3];
 
 		/*
 		 * If the descriptor cannot be read, use zeros rather than
 		 * whatever happens to be on the kernel stack: these words
 		 * end up in user-visible registers.
 		 */
 		if (copyin((void *)entry, entry_desc, sizeof(entry_desc)) != 0)
 			bzero(entry_desc, sizeof(entry_desc));
 		tf->srr0 = entry_desc[0];
 		tf->fixreg[2] = entry_desc[1];
 		tf->fixreg[11] = entry_desc[2];
 		tf->srr1 = PSL_SF | PSL_USERSET | PSL_FE_DFLT;
 	    #endif
 	}
 
 	#ifdef __powerpc64__
 	if (mfmsr() & PSL_HV)
 		tf->srr1 |= PSL_HV;
 	#endif
 	td->td_pcb->pcb_flags = 0;
 
 	td->td_retval[0] = (register_t)entry;
 	td->td_retval[1] = 0;
 }
 
 /*
  * Try to emulate the user instruction at frame->srr0.
  *
  * mfpvr and the sync family are handled here and return 0 after stepping
  * srr0 past the instruction.  Anything else is handed to the FPU emulator
  * when FPU_EMU is configured; otherwise SIGILL is returned.
  */
 int
 ppc_instr_emulate(struct trapframe *frame, struct pcb *pcb)
 {
 	uint32_t insn;
 	int rt;
 
 	insn = fuword32((void *)frame->srr0);
 
 	if ((insn & 0xfc1fffff) == 0x7c1f42a6) {	/* mfpvr */
 		/* The target register number is in the bits the mask leaves out. */
 		rt = (insn & ~0xfc1fffff) >> 21;
 		frame->fixreg[rt] = mfpvr();
 		frame->srr0 += 4;
 		return (0);
 	}
 
 	if ((insn & 0xfc000ffe) == 0x7c0004ac) {	/* various sync */
 		powerpc_sync(); /* Do a heavy-weight sync */
 		frame->srr0 += 4;
 		return (0);
 	}
 
 #ifdef FPU_EMU
 	/* First use of the emulator: start from a zeroed FPU state. */
 	if (!(pcb->pcb_flags & PCB_FPREGS)) {
 		bzero(&pcb->pcb_fpu, sizeof(pcb->pcb_fpu));
 		pcb->pcb_flags |= PCB_FPREGS;
 	}
 	return (fpu_emulate(frame, &pcb->pcb_fpu));
 #else
 	return (SIGILL);
 #endif
 }
 
Index: projects/bsd_rdma_4_9/sys/riscv/riscv/machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/riscv/riscv/machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/riscv/riscv/machdep.c	(revision 326162)
@@ -1,894 +1,889 @@
 /*-
  * Copyright (c) 2014 Andrew Turner
  * Copyright (c) 2015-2017 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * Portions of this software were developed by SRI International and the
  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Portions of this software were developed by the University of Cambridge
  * Computer Laboratory as part of the CTSRD Project, with support from the
  * UK Higher Education Innovation Fund (HEIF).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/msgbuf.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pager.h>
 
 #include <machine/riscvreg.h>
 #include <machine/cpu.h>
 #include <machine/kdb.h>
 #include <machine/machdep.h>
 #include <machine/pcb.h>
 #include <machine/reg.h>
 #include <machine/trap.h>
 #include <machine/vmparam.h>
 #include <machine/intr.h>
 #include <machine/sbi.h>
 
 #include <machine/asm.h>
 
 #ifdef FPE
 #include <machine/fpe.h>
 #endif
 
 #ifdef FDT
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 #endif
 
 struct pcpu __pcpu[MAXCPU];
 
 static struct trapframe proc0_tf;
 
 vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2];
 vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2];
 
 int early_boot = 1;
 int cold = 1;
 long realmem = 0;
 long Maxmem = 0;
 
 #define	DTB_SIZE_MAX	(1024 * 1024)
 
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 vm_paddr_t physmap[PHYSMAP_SIZE];
 u_int physmap_idx;
 
 struct kva_md_info kmi;
 
 int64_t dcache_line_size;	/* The minimum D cache line size */
 int64_t icache_line_size;	/* The minimum I cache line size */
 int64_t idcache_line_size;	/* The minimum cache line size */
 
 extern int *end;
 extern int *initstack_end;
 
 struct pcpu *pcpup;
 
 uintptr_t mcall_trap(uintptr_t mcause, uintptr_t* regs);
 
 /*
  * Placeholder: both arguments are ignored and 0 is always returned.
  */
 uintptr_t
 mcall_trap(uintptr_t mcause, uintptr_t* regs)
 {
 
 	return (0);
 }
 
 /*
  * SYSINIT hook (registered right below at SI_SUB_CPU / SI_ORDER_FIRST).
  * Calls identify_cpu(), then vm_ksubmap_init(), bufinit() and
  * vm_pager_bufferinit() in that order.  The `dummy` argument is unused.
  */
 static void
 cpu_startup(void *dummy)
 {
 
 	identify_cpu();
 
 	vm_ksubmap_init(&kmi);
 	bufinit();
 	vm_pager_bufferinit();
 }
 
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /*
  * Stub: `cpu` is ignored and 0 is always returned.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 
 	return (0);
 }
 
 /*
  * Copy the general-purpose register state stored in td's trapframe
  * into *regs.  Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tf = td->td_frame;
 
 	/* Register arrays. */
 	memcpy(regs->a, tf->tf_a, sizeof(regs->a));
 	memcpy(regs->s, tf->tf_s, sizeof(regs->s));
 	memcpy(regs->t, tf->tf_t, sizeof(regs->t));
 
 	/* Individually named registers. */
 	regs->tp = tf->tf_tp;
 	regs->gp = tf->tf_gp;
 	regs->sp = tf->tf_sp;
 	regs->ra = tf->tf_ra;
 	regs->sstatus = tf->tf_sstatus;
 	regs->sepc = tf->tf_sepc;
 
 	return (0);
 }
 
 /*
  * Load td's trapframe from *regs; the inverse of fill_regs().
  * Always returns 0.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tf = td->td_frame;
 
 	/* Register arrays. */
 	memcpy(tf->tf_a, regs->a, sizeof(tf->tf_a));
 	memcpy(tf->tf_s, regs->s, sizeof(tf->tf_s));
 	memcpy(tf->tf_t, regs->t, sizeof(tf->tf_t));
 
 	/* Individually named registers. */
 	tf->tf_tp = regs->tp;
 	tf->tf_gp = regs->gp;
 	tf->tf_sp = regs->sp;
 	tf->tf_ra = regs->ra;
 	tf->tf_sstatus = regs->sstatus;
 	tf->tf_sepc = regs->sepc;
 
 	return (0);
 }
 
 int
 fill_fpregs(struct thread *td, struct fpreg *regs)
 {
 #ifdef FPE
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 
 	if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
 		/*
 		 * If we have just been running FPE instructions we will
 		 * need to save the state to memcpy it below.
 		 */
 		fpe_state_save(td);
 
 		memcpy(regs->fp_x, pcb->pcb_x, sizeof(regs->fp_x));
 		regs->fp_fcsr = pcb->pcb_fcsr;
 	} else
 #endif
 		memset(regs->fp_x, 0, sizeof(regs->fp_x));
 
 	return (0);
 }
 
 /*
  * Store the floating-point registers and fcsr from *regs into td's pcb.
  * Without FPE nothing is written.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *regs)
 {
 #ifdef FPE
 	struct pcb *dst = td->td_pcb;
 
 	dst->pcb_fcsr = regs->fp_fcsr;
 	memcpy(dst->pcb_x, regs->fp_x, sizeof(regs->fp_x));
 #endif
 
 	return (0);
 }
 
 /*
  * Not implemented: calls panic() unconditionally.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *regs)
 {
 
 	panic("fill_dbregs");
 }
 
 /*
  * Not implemented: calls panic() unconditionally.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *regs)
 {
 
 	panic("set_dbregs");
 }
 
 /*
  * Not implemented: calls panic() before anything else; neither `td`
  * nor `addr` is used.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 
 	panic("ptrace_set_pc");
 	return (0);
 }
 
 /*
  * Stub (still marked TODO): does nothing and returns 0.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Stub (still marked TODO): does nothing and returns 0.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Set up td's initial register state for the image described by imgp.
  *
  * The trapframe is zeroed; tf_a[0] receives `stack`, tf_sp the
  * STACKALIGN()ed `stack`, and both tf_ra and tf_sepc the image entry
  * address.  PCB_FP_STARTED is cleared in the pcb's pcb_fpflags.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 
 	memset(tf, 0, sizeof(struct trapframe));
 
-	/*
-	 * We need to set a0 for init as it doesn't call
-	 * cpu_set_syscall_retval to copy the value. We also
-	 * need to set td_retval for the cases where we do.
-	 */
-	tf->tf_a[0] = td->td_retval[0] = stack;
+	tf->tf_a[0] = stack;
 	tf->tf_sp = STACKALIGN(stack);
 	tf->tf_ra = imgp->entry_addr;
 	tf->tf_sepc = imgp->entry_addr;
 
 	pcb->pcb_fpflags &= ~PCB_FP_STARTED;
 }
 
 /* Sanity check these are the same size, they will be memcpy'd to and fro */
 CTASSERT(sizeof(((struct trapframe *)0)->tf_a) ==
     sizeof((struct gpregs *)0)->gp_a);
 CTASSERT(sizeof(((struct trapframe *)0)->tf_s) ==
     sizeof((struct gpregs *)0)->gp_s);
 CTASSERT(sizeof(((struct trapframe *)0)->tf_t) ==
     sizeof((struct gpregs *)0)->gp_t);
 CTASSERT(sizeof(((struct trapframe *)0)->tf_a) ==
     sizeof((struct reg *)0)->a);
 CTASSERT(sizeof(((struct trapframe *)0)->tf_s) ==
     sizeof((struct reg *)0)->s);
 CTASSERT(sizeof(((struct trapframe *)0)->tf_t) ==
     sizeof((struct reg *)0)->t);
 
 /* Support for FDT configurations only. */
 CTASSERT(FDT);
 
 /*
  * Snapshot td's trapframe registers into mcp->mc_gpregs.
  *
  * When GET_MC_CLEAR_RET is set in clear_ret, the saved a0 and t0 are
  * zeroed in the snapshot (the trapframe itself is left alone).
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 
 	/* Individually named registers. */
 	mcp->mc_gpregs.gp_sstatus = frame->tf_sstatus;
 	mcp->mc_gpregs.gp_sepc = frame->tf_sepc;
 	mcp->mc_gpregs.gp_tp = frame->tf_tp;
 	mcp->mc_gpregs.gp_gp = frame->tf_gp;
 	mcp->mc_gpregs.gp_sp = frame->tf_sp;
 	mcp->mc_gpregs.gp_ra = frame->tf_ra;
 
 	/* Register arrays. */
 	memcpy(mcp->mc_gpregs.gp_a, frame->tf_a, sizeof(mcp->mc_gpregs.gp_a));
 	memcpy(mcp->mc_gpregs.gp_s, frame->tf_s, sizeof(mcp->mc_gpregs.gp_s));
 	memcpy(mcp->mc_gpregs.gp_t, frame->tf_t, sizeof(mcp->mc_gpregs.gp_t));
 
 	if ((clear_ret & GET_MC_CLEAR_RET) != 0) {
 		mcp->mc_gpregs.gp_t[0] = 0; /* clear syscall error */
 		mcp->mc_gpregs.gp_a[0] = 0;
 	}
 
 	return (0);
 }
 
 /*
  * Load td's trapframe from mcp->mc_gpregs; the inverse of get_mcontext().
  * No validation is done here.  Always returns 0.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct trapframe *frame = td->td_frame;
 
 	/* Individually named registers. */
 	frame->tf_sstatus = mcp->mc_gpregs.gp_sstatus;
 	frame->tf_sepc = mcp->mc_gpregs.gp_sepc;
 	frame->tf_tp = mcp->mc_gpregs.gp_tp;
 	frame->tf_gp = mcp->mc_gpregs.gp_gp;
 	frame->tf_sp = mcp->mc_gpregs.gp_sp;
 	frame->tf_ra = mcp->mc_gpregs.gp_ra;
 
 	/* Register arrays. */
 	memcpy(frame->tf_a, mcp->mc_gpregs.gp_a, sizeof(frame->tf_a));
 	memcpy(frame->tf_s, mcp->mc_gpregs.gp_s, sizeof(frame->tf_s));
 	memcpy(frame->tf_t, mcp->mc_gpregs.gp_t, sizeof(frame->tf_t));
 
 	return (0);
 }
 
 /*
  * Record the current thread's floating-point state in mcp.
  *
  * Only does work when built with FPE and PCB_FP_STARTED is set in the
  * current pcb: the state is saved with fpe_state_save(), copied into
  * mcp->mc_fpregs, and _MC_FP_VALID is or'ed into mcp->mc_flags.  td must
  * be the current thread (asserted).  Runs inside a critical section.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 #ifdef FPE
 	struct pcb *curpcb;
 
 	critical_enter();
 
 	curpcb = curthread->td_pcb;
 
 	KASSERT(td->td_pcb == curpcb, ("Invalid fpe pcb"));
 
 	if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
 		/*
 		 * If we have just been running FPE instructions we will
 		 * need to save the state to memcpy it below.
 		 */
 		fpe_state_save(td);
 
 		KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0,
 		    ("Non-userspace FPE flags set in get_fpcontext"));
 		/*
 		 * Copy the register array only.  The length must be that of
 		 * fp_x, not of the whole mc_fpregs, or the copy reads past
 		 * pcb_x; fcsr and the flags are assigned individually below.
 		 */
 		memcpy(mcp->mc_fpregs.fp_x, curpcb->pcb_x,
 		    sizeof(mcp->mc_fpregs.fp_x));
 		mcp->mc_fpregs.fp_fcsr = curpcb->pcb_fcsr;
 		mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags;
 		mcp->mc_flags |= _MC_FP_VALID;
 	}
 
 	critical_exit();
 #endif
 }
 
 /*
  * Install the floating-point state carried by mcp into the current
  * thread's pcb.
  *
  * Only does work when built with FPE and _MC_FP_VALID is set in
  * mcp->mc_flags.  The flags taken from mcp are masked with
  * PCB_FP_USERMASK before being stored.  Runs inside a critical section.
  */
 static void
 set_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 #ifdef FPE
 	struct pcb *curpcb;
 
 	critical_enter();
 
 	if ((mcp->mc_flags & _MC_FP_VALID) != 0) {
 		curpcb = curthread->td_pcb;
 		/*
 		 * FPE usage is enabled, override registers.  Copy exactly
 		 * the register array: using the size of the whole mc_fpregs
 		 * would write caller-supplied bytes past pcb_x.
 		 */
 		memcpy(curpcb->pcb_x, mcp->mc_fpregs.fp_x,
 		    sizeof(mcp->mc_fpregs.fp_x));
 		curpcb->pcb_fcsr = mcp->mc_fpregs.fp_fcsr;
 		curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK;
 	}
 
 	critical_exit();
 #endif
 }
 
 /*
  * Idle the CPU.
  *
  * The whole sequence runs between spinlock_enter() and spinlock_exit().
  * cpu_idleclock() / cpu_activeclock() bracket the wait only when `busy`
  * is zero, and the "fence; wfi" pair is executed only if
  * sched_runnable() returns false.
  */
 void
 cpu_idle(int busy)
 {
 
 	spinlock_enter();
 	if (!busy)
 		cpu_idleclock();
 	if (!sched_runnable())
 		__asm __volatile(
 		    "fence \n"
 		    "wfi   \n");
 	if (!busy)
 		cpu_activeclock();
 	spinlock_exit();
 }
 
 /*
  * Not implemented: calls panic() unconditionally.
  */
 void
 cpu_halt(void)
 {
 
 	panic("cpu_halt");
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * Currently an empty placeholder: `ptr` and `len` are ignored and no
  * cache operation is performed.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 
 	/* TBD */
 }
 
 /*
  * Get current clock frequency for the given CPU ID.
  *
  * Not implemented: calls panic() unconditionally; *rate is never written.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 
 	panic("cpu_est_clockrate");
 }
 
 /*
  * Machine-dependent per-CPU data hook.  The body is empty: none of the
  * arguments is used.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 }
 
 /*
  * Enter a spinlock section for the current thread.
  *
  * On the outermost entry (md_spinlock_count == 0) the count is set to 1
  * and the value returned by intr_disable() is stashed in
  * md_saved_sstatus_ie; nested entries only bump the count.
  * critical_enter() is called on every entry.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0) {
 		td->td_md.md_spinlock_count = 1;
 		td->td_md.md_saved_sstatus_ie = intr_disable();
 	} else
 		td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section; counterpart of spinlock_enter().
  *
  * critical_exit() is called first.  The saved interrupt state is read
  * before the count is decremented and is handed to intr_restore() only
  * when the count drops to zero (outermost exit).
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 	register_t sstatus_ie;
 
 	td = curthread;
 	critical_exit();
 	sstatus_ie = td->td_md.md_saved_sstatus_ie;
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0)
 		intr_restore(sstatus_ie);
 }
 
 #ifndef	_SYS_SYSPROTO_H_
 struct sigreturn_args {
 	ucontext_t *ucp;
 };
 #endif
 
 /*
  * sigreturn system call: reinstall the user context pointed to by
  * uap->sigcntxp.
  *
  * Returns EFAULT if uap is NULL or the context cannot be copied in,
  * EINVAL if the context's sstatus has SSTATUS_SPP set, any error from
  * set_mcontext(), and EJUSTRETURN once the context, FP state and signal
  * mask have been installed.
  */
 int
 sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	uint64_t saved_status;
 	ucontext_t ctx;
 	int rv;
 
 	if (uap == NULL || copyin(uap->sigcntxp, &ctx, sizeof(ctx)) != 0)
 		return (EFAULT);
 
 	/*
 	 * The processor mode must not have been tampered with and
 	 * interrupts must not have been disabled.  Supervisor interrupts
 	 * in user mode are always enabled.
 	 */
 	saved_status = ctx.uc_mcontext.mc_gpregs.gp_sstatus;
 	if ((saved_status & SSTATUS_SPP) != 0)
 		return (EINVAL);
 
 	rv = set_mcontext(td, &ctx.uc_mcontext);
 	if (rv != 0)
 		return (rv);
 
 	set_fpcontext(td, &ctx.uc_mcontext);
 
 	/* Put the saved signal mask back in place. */
 	kern_sigprocmask(td, SIG_SETMASK, &ctx.uc_sigmask, NULL, 0);
 
 	return (EJUSTRETURN);
 }
 
 /*
  * Construct a PCB from a trapframe. This is called from kdb_trap() where
  * we want to start a backtrace from the function that caused us to enter
  * the debugger. We have the context in the trapframe, but base the trace
  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
  * enough for a backtrace.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Individually named registers. */
 	pcb->pcb_sepc = tf->tf_sepc;
 	pcb->pcb_tp = tf->tf_tp;
 	pcb->pcb_gp = tf->tf_gp;
 	pcb->pcb_sp = tf->tf_sp;
 	pcb->pcb_ra = tf->tf_ra;
 
 	/* Register arrays; the lengths are those of the trapframe arrays. */
 	memcpy(pcb->pcb_a, tf->tf_a, sizeof(tf->tf_a));
 	memcpy(pcb->pcb_s, tf->tf_s, sizeof(tf->tf_s));
 	memcpy(pcb->pcb_t, tf->tf_t, sizeof(tf->tf_t));
 }
 
 /*
  * Arrange for the current thread to run `catcher` for the signal
  * described by ksi.
  *
  * A struct sigframe holding the siginfo and the interrupted context is
  * built in kernel memory and copied out below the user stack pointer
  * (or to the top of the alternate signal stack when one is configured,
  * not already in use, and requested for this signal).  The trapframe is
  * then rewritten so that a0..a2 carry (sig, &siginfo, &ucontext), sepc
  * points at the handler, sp at the new frame and ra at the signal
  * trampoline.  If the copyout fails, sigexit(td, SIGILL) is called.
  *
  * Entered with the process lock and ps_mtx held (asserted); both are
  * released around the copyout and re-acquired before returning.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe *fp, frame;
 	struct sysentvec *sysent;
 	struct trapframe *tf;
 	struct sigacts *psp;
 	struct thread *td;
 	struct proc *p;
 	int onstack;
 	int code;
 	int sig;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 
 	tf = td->td_frame;
 	onstack = sigonstack(tf->tf_sp);
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	    catcher, sig);
 
 	/* Allocate and validate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size);
 	} else {
 		fp = (struct sigframe *)td->td_frame->tf_sp;
 	}
 
 	/* Make room, keeping the stack aligned */
 	fp--;
 	fp = (struct sigframe *)STACKALIGN(fp);
 
 	/*
 	 * Start from an all-zero frame: get_fpcontext() only or's a bit into
 	 * mc_flags, and every byte of the frame is copied out to userland,
 	 * so nothing may be left holding stale kernel stack contents.
 	 */
 	memset(&frame, 0, sizeof(frame));
 
 	/* Fill in the frame to copy out */
 	get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
 	get_fpcontext(td, &frame.sf_uc.uc_mcontext);
 	frame.sf_si = ksi->ksi_info;
 	frame.sf_uc.uc_sigmask = *mask;
 	/*
 	 * Copy the signal stack description first and only then set
 	 * ss_flags; the struct assignment would otherwise overwrite it.
 	 */
 	frame.sf_uc.uc_stack = td->td_sigstk;
 	frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
 	    ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(td->td_proc);
 
 	/* Copy the sigframe out to the user's stack. */
 	if (copyout(&frame, fp, sizeof(*fp)) != 0) {
 		/* Process has trashed its stack. Kill it. */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Handler arguments: signal number, siginfo and ucontext. */
 	tf->tf_a[0] = sig;
 	tf->tf_a[1] = (register_t)&fp->sf_si;
 	tf->tf_a[2] = (register_t)&fp->sf_uc;
 
 	tf->tf_sepc = (register_t)catcher;
 	tf->tf_sp = (register_t)fp;
 
 	/* The handler returns into the signal trampoline. */
 	sysent = p->p_sysent;
 	if (sysent->sv_sigcode_base != 0)
 		tf->tf_ra = (register_t)sysent->sv_sigcode_base;
 	else
 		tf->tf_ra = (register_t)(sysent->sv_psstrings -
 		    *(sysent->sv_szsigcode));
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_sepc,
 	    tf->tf_sp);
 
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * Wire up proc0 / thread0 around the kernel stack at `kstack`.
  *
  * pcpup is pointed at __pcpu[0] and proc0 is linked to thread0.  The pcb
  * pointer is computed as one struct pcb below the address `kstack`, its
  * pcb_fpflags is cleared, thread0's trapframe is set to the static
  * proc0_tf, and the pcb is recorded as pc_curpcb.
  */
 static void
 init_proc0(vm_offset_t kstack)
 {
 
 	pcpup = &__pcpu[0];
 
 	proc_linkup0(&proc0, &thread0);
 	thread0.td_kstack = kstack;
 	thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1;
 	thread0.td_pcb->pcb_fpflags = 0;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 }
 
 /*
  * Insert the range [base, base + length) into `physmap`, an array of
  * (start, end) pairs stored in even/odd slots and kept in ascending
  * order.  *physmap_idxp is advanced by 2 whenever a new pair is added.
  *
  * Returns 0 only when adding a pair makes the index reach PHYSMAP_SIZE;
  * returns 1 in every other case, including when the range is empty, is
  * rejected because it overlaps an existing pair, or is merged into a
  * neighbouring pair.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     u_int *physmap_idxp)
 {
 	u_int i, insert_idx, _physmap_idx;
 
 	_physmap_idx = *physmap_idxp;
 
 	/* An empty range is accepted and ignored. */
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 */
 	insert_idx = _physmap_idx;
 	for (i = 0; i <= _physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			/* Entirely below pair i: insert in front of it. */
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= _physmap_idx &&
 	    base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is needed.  Note that the caller's index is updated
 	 * before the capacity check, so it stays advanced on failure.
 	 */
 	_physmap_idx += 2;
 	*physmap_idxp = _physmap_idx;
 	if (_physmap_idx == PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = _physmap_idx; i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 
 	printf("physmap[%d] = 0x%016lx\n", insert_idx, base);
 	printf("physmap[%d] = 0x%016lx\n", insert_idx + 1, base + length);
 	return (1);
 }
 
 #ifdef FDT
 /*
  * Install the FDT Open Firmware implementation and initialise it with
  * the device tree blob at dtbp.
  *
  * With FDT_DTB_STATIC the address of fdt_static_dtb replaces whatever
  * was passed in.  A zero dtbp is reported and ignored; a failure of
  * OF_install() or OF_init() panics.  `kmdp` is not used.
  */
 static void
 try_load_dtb(caddr_t kmdp, vm_offset_t dtbp)
 {
 
 #ifdef FDT_DTB_STATIC
 	dtbp = (vm_offset_t)&fdt_static_dtb;
 #endif
 
 	if (dtbp == 0) {
 		printf("ERROR loading DTB\n");
 		return;
 	}
 
 	if (!OF_install(OFW_FDT, 0))
 		panic("Cannot install FDT");
 
 	if (OF_init((void *)dtbp) != 0)
 		panic("OF_init failed with the found device tree");
 }
 #endif
 
 /*
  * Placeholder (marked TODO): performs no work.
  */
 static void
 cache_setup(void)
 {
 
 	/* TODO */
 }
 
 /*
  * Fake up a boot descriptor table.
  * RISCVTODO: This needs to be done via loader (when it's available).
  *
  * Builds name, type, address and size records for the kernel in a static
  * array of 32-bit words, terminates the list with two zero words,
  * publishes it through preload_metadata, and returns the address of
  * `end` as the last used address.  `rvbp` is unused.
  */
 vm_offset_t
 fake_preload_metadata(struct riscv_bootparams *rvbp __unused)
 {
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
 	vm_offset_t lastaddr;
 	int i = 0;
 	static uint32_t fake_preload[35];
 
 	/*
 	 * Each record is a type word, a length word, then the payload.
 	 * Strings are written in place over consecutive words, so `i` is
 	 * advanced by hand past the words the string occupies.
 	 */
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "kernel");
 	i += 1;		/* 7 bytes span 2 words */
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf64 kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf64 kernel");
 	i += 3;		/* 13 bytes span 4 words */
 	/*
 	 * NOTE(review): the 64-bit values below are assigned to a single
 	 * uint32_t slot, which keeps only the low 32 bits; the following
 	 * word is skipped and stays zero.  Confirm this is intended.
 	 */
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = (uint64_t)(KERNBASE + KERNENTRY);
 	i += 1;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint64_t);
 	printf("end is 0x%016lx\n", (uint64_t)&end);
 	fake_preload[i++] = (uint64_t)&end - (uint64_t)(KERNBASE + KERNENTRY);
 	i += 1;
 #ifdef DDB
 #if 0
 	/* RISCVTODO */
 	if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
 		lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
 		zend = lastaddr;
 		zstart = *(uint32_t *)(KERNVIRTADDR + 4);
 		db_fetch_ksymtab(zstart, zend);
 	} else
 #endif
 #endif
 		/* With the block above compiled out this always runs. */
 		lastaddr = (vm_offset_t)&end;
 	/* Two zero words terminate the record list. */
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 	return (lastaddr);
 }
 
 /*
  * Early machine-dependent initialisation.
  *
  * Reads dtbp_virt, dtbp_phys, kern_l1pt, kern_stack and kern_phys from
  * rvbp.  In order: fakes the preload metadata, loads the device tree,
  * fills physmap from the FDT memory regions, sets up the boot CPU's pcpu
  * and curthread, bootstraps pmap, initialises the console and proc0,
  * then the message buffer, mutexes, tunables, kdb and interrupts, and
  * finally clears early_boot.
  */
 void
 initriscv(struct riscv_bootparams *rvbp)
 {
 	struct mem_region mem_regions[FDT_MEM_REGIONS];
 	vm_offset_t rstart, rend;
 	vm_offset_t s, e;
 	int mem_regions_sz;
 	vm_offset_t lastaddr;
 	vm_size_t kernlen;
 	caddr_t kmdp;
 	int i;
 
 	/* Set the module data location */
 	lastaddr = fake_preload_metadata(rvbp);
 
 	/* Find the kernel address */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 
 	/*
 	 * NOTE(review): the first assignment is overwritten right away, so
 	 * only RB_VERBOSE takes effect.
 	 */
 	boothowto = RB_VERBOSE | RB_SINGLE;
 	boothowto = RB_VERBOSE;
 
 	kern_envp = NULL;
 
 #ifdef FDT
 	try_load_dtb(kmdp, rvbp->dtbp_virt);
 #endif
 
 	/* Load the physical memory ranges */
 	physmap_idx = 0;
 
 #ifdef FDT
 	/* Grab physical memory regions information from device tree. */
 	if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, NULL) != 0)
 		panic("Cannot get physical memory regions");
 
 	/* [s, e) is the window reserved for the DTB: DTB_SIZE_MAX bytes. */
 	s = rvbp->dtbp_phys;
 	e = s + DTB_SIZE_MAX;
 
 	for (i = 0; i < mem_regions_sz; i++) {
 		rstart = mem_regions[i].mr_start;
 		rend = (mem_regions[i].mr_start + mem_regions[i].mr_size);
 
 		/*
 		 * Only a region that starts below s and ends above e is
 		 * split around the window; any other region is added whole.
 		 */
 		if ((rstart < s) && (rend > e)) {
 			/* Exclude DTB region. */
 			add_physmap_entry(rstart, (s - rstart), physmap, &physmap_idx);
 			add_physmap_entry(e, (rend - e), physmap, &physmap_idx);
 		} else {
 			add_physmap_entry(mem_regions[i].mr_start,
 			    mem_regions[i].mr_size, physmap, &physmap_idx);
 		}
 	}
 #endif
 
 	/* Set the pcpu data, this is needed by pmap_bootstrap */
 	pcpup = &__pcpu[0];
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 
 	/* Set the pcpu pointer */
 	__asm __volatile("mv gp, %0" :: "r"(pcpup));
 
 	PCPU_SET(curthread, &thread0);
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 
 	cache_setup();
 
 	/* Bootstrap enough of pmap to enter the kernel proper */
 	kernlen = (lastaddr - KERNBASE);
 	/* mem_regions[] is only filled in by the FDT block above. */
 	pmap_bootstrap(rvbp->kern_l1pt, mem_regions[0].mr_start, kernlen);
 
 	cninit();
 
 	init_proc0(rvbp->kern_stack);
 
 	/* set page table base register for thread0 */
 	thread0.td_pcb->pcb_l1addr = \
 	    (rvbp->kern_l1pt - KERNBASE + rvbp->kern_phys);
 
 	msgbufinit(msgbufp, msgbufsize);
 	mutex_init();
 	init_param2(physmem);
 	kdb_init();
 
 	riscv_init_interrupts();
 
 	early_boot = 0;
 }
 
 #undef bzero
 /*
  * Byte-at-a-time bzero: store zero into each of the `len` bytes that
  * start at `buf`.  A zero `len` writes nothing.
  */
 void
 bzero(void *buf, size_t len)
 {
 	uint8_t *bytes = buf;
 	size_t off;
 
 	for (off = 0; off < len; off++)
 		bytes[off] = 0;
 }
Index: projects/bsd_rdma_4_9/sys/sparc64/sparc64/machdep.c
===================================================================
--- projects/bsd_rdma_4_9/sys/sparc64/sparc64/machdep.c	(revision 326161)
+++ projects/bsd_rdma_4_9/sys/sparc64/sparc64/machdep.c	(revision 326162)
@@ -1,1116 +1,1113 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001 Jake Burkholder.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  *	from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/cons.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/interrupt.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/timetc.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #include <ddb/ddb.h>
 
 #include <machine/bus.h>
 #include <machine/cache.h>
 #include <machine/cmt.h>
 #include <machine/cpu.h>
 #include <machine/fireplane.h>
 #include <machine/fp.h>
 #include <machine/fsr.h>
 #include <machine/intr_machdep.h>
 #include <machine/jbus.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/ofw_machdep.h>
 #include <machine/ofw_mem.h>
 #include <machine/pcb.h>
 #include <machine/pmap.h>
 #include <machine/pstate.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/smp.h>
 #include <machine/tick.h>
 #include <machine/tlb.h>
 #include <machine/tstate.h>
 #include <machine/upa.h>
 #include <machine/ver.h>
 
 typedef int ofw_vec_t(void *);
 
 int dtlb_slots;
 int itlb_slots;
 struct tlb_entry *kernel_tlbs;
 int kernel_tlb_slots;
 
 int cold = 1;
 long Maxmem;
 long realmem;
 
 void *dpcpu0;
 char pcpu0[PCPU_PAGES * PAGE_SIZE];
 struct pcpu dummy_pcpu[MAXCPU];
 struct trapframe frame0;
 
 vm_offset_t kstack0;
 vm_paddr_t kstack0_phys;
 
 struct kva_md_info kmi;
 
 u_long ofw_vec;
 u_long ofw_tba;
 u_int tba_taken_over;
 
 char sparc64_model[32];
 
 static int cpu_use_vis = 1;
 
 cpu_block_copy_t *cpu_block_copy;
 cpu_block_zero_t *cpu_block_zero;
 
 static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl);
 void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3,
     ofw_vec_t *vec);
 static void sparc64_shutdown_final(void *dummy, int howto);
 
 static void cpu_startup(void *arg);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 CTASSERT((1 << INT_SHIFT) == sizeof(int));
 CTASSERT((1 << PTR_SHIFT) == sizeof(char *));
 
 CTASSERT(sizeof(struct reg) == 256);
 CTASSERT(sizeof(struct fpreg) == 272);
 CTASSERT(sizeof(struct __mcontext) == 512);
 
 CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0);
 CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0);
 CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0);
 CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8));
 
 CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2));
 
 /*
  * SYSINIT hook for CPU start-up.
  *
  * Totals the sizes in sparc64_memreg[] to report and record real memory,
  * runs vm_ksubmap_init(), bufinit() and vm_pager_bufferinit(), registers
  * sparc64_shutdown_final() as a shutdown_final handler, reports available
  * memory and (when bootverbose) the machine model, then calls
  * cpu_identify().  `arg` is unused.
  */
 static void
 cpu_startup(void *arg)
 {
 	vm_paddr_t total;
 	int idx;
 
 	total = 0;
 	for (idx = 0; idx < sparc64_nmemreg; idx++)
 		total += sparc64_memreg[idx].mr_size;
 
 	printf("real memory  = %lu (%lu MB)\n", total,
 	    total / (1024 * 1024));
 	realmem = (long)total / PAGE_SIZE;
 
 	vm_ksubmap_init(&kmi);
 
 	bufinit();
 	vm_pager_bufferinit();
 
 	EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL,
 	    SHUTDOWN_PRI_LAST);
 
 	printf("avail memory = %lu (%lu MB)\n", vm_cnt.v_free_count * PAGE_SIZE,
 	    vm_cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE));
 
 	if (bootverbose)
 		printf("machine: %s\n", sparc64_model);
 
 	cpu_identify(rdpr(ver), PCPU_GET(clock), curcpu);
 }
 
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 	struct intr_request *ir;
 	int i;
 
 	pcpu->pc_irtail = &pcpu->pc_irhead;
 	for (i = 0; i < IR_FREE; i++) {
 		ir = &pcpu->pc_irpool[i];
 		ir->ir_next = pcpu->pc_irfree;
 		pcpu->pc_irfree = ir;
 	}
 }
 
 /*
  * Enter a spinlock section for the current thread.
  *
  * On the outermost entry (md_spinlock_count == 0) the current pil is
  * read, wrpr(pil, 0, PIL_TICK) is issued, the count is set to 1 and the
  * old pil is stashed in md_saved_pil; nested entries only bump the
  * count.  critical_enter() is called on every entry.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 	register_t pil;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0) {
 		pil = rdpr(pil);
 		wrpr(pil, 0, PIL_TICK);
 		td->td_md.md_spinlock_count = 1;
 		td->td_md.md_saved_pil = pil;
 	} else
 		td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section; counterpart of spinlock_enter().
  *
  * critical_exit() is called first.  The saved pil is read before the
  * count is decremented and is written back with wrpr() only when the
  * count drops to zero (outermost exit).
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 	register_t pil;
 
 	td = curthread;
 	critical_exit();
 	pil = td->td_md.md_saved_pil;
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0)
 		wrpr(pil, pil, 0);
 }
 
 /*
  * Depth-first search of the Open Firmware tree, starting at `node` and
  * walking its peers, for the CPU node whose port ID property (named by
  * cpu_portid_prop(cpu_impl)) equals bspid.
  *
  * A node that has children is only descended into; a childless node is
  * examined itself and must have device_type "cpu".  Returns the matching
  * node, or 0 when none is found.
  */
 static phandle_t
 find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl)
 {
 	char devtype[sizeof("cpu")];
 	phandle_t sub, hit;
 	uint32_t id;
 
 	for (; node != 0; node = OF_peer(node)) {
 		sub = OF_child(node);
 		if (sub > 0) {
 			/* Interior node: search the subtree only. */
 			hit = find_bsp(sub, bspid, cpu_impl);
 			if (hit > 0)
 				return (hit);
 			continue;
 		}
 
 		/* Leaf node: skip anything that is not a CPU. */
 		if (OF_getprop(node, "device_type", devtype,
 		    sizeof(devtype)) <= 0 || strcmp(devtype, "cpu") != 0)
 			continue;
 
 		if (OF_getprop(node, cpu_portid_prop(cpu_impl), &id,
 		    sizeof(id)) > 0 && id == bspid)
 			return (node);
 	}
 	return (0);
 }
 
 /*
  * Map a CPU implementation code to the name of the property that
  * find_bsp() compares against the boot CPU's ID.  Unknown
  * implementations yield the empty string.
  */
 const char *
 cpu_portid_prop(u_int cpu_impl)
 {
 	const char *prop;
 
 	switch (cpu_impl) {
 	case CPU_IMPL_SPARC64:
 	case CPU_IMPL_SPARC64V:
 	case CPU_IMPL_ULTRASPARCI:
 	case CPU_IMPL_ULTRASPARCII:
 	case CPU_IMPL_ULTRASPARCIIi:
 	case CPU_IMPL_ULTRASPARCIIe:
 		prop = "upa-portid";
 		break;
 	case CPU_IMPL_ULTRASPARCIII:
 	case CPU_IMPL_ULTRASPARCIIIp:
 	case CPU_IMPL_ULTRASPARCIIIi:
 	case CPU_IMPL_ULTRASPARCIIIip:
 		prop = "portid";
 		break;
 	case CPU_IMPL_ULTRASPARCIV:
 	case CPU_IMPL_ULTRASPARCIVp:
 		prop = "cpuid";
 		break;
 	default:
 		prop = "";
 		break;
 	}
 
 	return (prop);
 }
 
 /*
  * Read the module ID of the executing CPU.  Which register is read and
  * which extraction macro is applied depends on cpu_impl; unknown
  * implementations yield 0.
  */
 uint32_t
 cpu_get_mid(u_int cpu_impl)
 {
 	uint32_t mid;
 
 	switch (cpu_impl) {
 	case CPU_IMPL_SPARC64:
 	case CPU_IMPL_SPARC64V:
 	case CPU_IMPL_ULTRASPARCI:
 	case CPU_IMPL_ULTRASPARCII:
 	case CPU_IMPL_ULTRASPARCIIi:
 	case CPU_IMPL_ULTRASPARCIIe:
 		mid = UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG));
 		break;
 	case CPU_IMPL_ULTRASPARCIII:
 	case CPU_IMPL_ULTRASPARCIIIp:
 		mid = FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG,
 		    ASI_FIREPLANE_CONFIG_REG));
 		break;
 	case CPU_IMPL_ULTRASPARCIIIi:
 	case CPU_IMPL_ULTRASPARCIIIip:
 		mid = JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG));
 		break;
 	case CPU_IMPL_ULTRASPARCIV:
 	case CPU_IMPL_ULTRASPARCIVp:
 		mid = INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID));
 		break;
 	default:
 		mid = 0;
 		break;
 	}
 
 	return (mid);
 }
 
 /*
  * Early machine-dependent initialization of the boot processor.
  *
  * The steps below run in a fixed order: CPU-specific setup and (S)TICK
  * handling, Open Firmware entry points, loader metadata parsing, per-CPU
  * data for the BSP, TLB slot cleanup, cache setup, block copy/zero
  * selection, pmap bootstrap, tunables, proc0/thread0 setup, trap table
  * takeover, message buffer, mutexes, console, interrupts and finally the
  * kernel debugger.
  *
  * mdp is the preloaded metadata pointer; the function panics through
  * OF_panic() when it or the values fetched from it are missing.
  * vec is recorded in ofw_vec as the firmware entry vector.
  * o1, o2 and o3 are not referenced in this function.
  */
 void
 sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
 {
 	char *env;
 	struct pcpu *pc;
 	vm_offset_t end;
 	vm_offset_t va;
 	caddr_t kmdp;
 	phandle_t root;
 	u_int cpu_impl;
 
 	end = 0;
 	kmdp = NULL;
 
 	/*
 	 * Find out what kind of CPU we have first, for anything that changes
 	 * behaviour.
 	 */
 	cpu_impl = VER_IMPL(rdpr(ver));
 
 	/*
 	 * Do CPU-specific initialization.
 	 */
 	if (cpu_impl >= CPU_IMPL_ULTRASPARCIII)
 		cheetah_init(cpu_impl);
 	else if (cpu_impl == CPU_IMPL_SPARC64V)
 		zeus_init(cpu_impl);
 
 	/*
 	 * Clear (S)TICK timer (including NPT).
 	 */
 	tick_clear(cpu_impl);
 
 	/*
 	 * UltraSparc II[e,i] based systems come up with the tick interrupt
 	 * enabled and a handler that resets the tick counter, causing DELAY()
 	 * to not work properly when used early in boot.
 	 * UltraSPARC III based systems come up with the system tick interrupt
 	 * enabled, causing an interrupt storm on startup since they are not
 	 * handled.
 	 */
 	tick_stop(cpu_impl);
 
 	/*
 	 * Set up Open Firmware entry points.
 	 */
 	ofw_tba = rdpr(tba);
 	ofw_vec = (u_long)vec;
 
 	/*
 	 * Parse metadata if present and fetch parameters.  Must be before the
 	 * console is inited so cninit() gets the right value of boothowto.
 	 */
 	if (mdp != NULL) {
 		preload_metadata = mdp;
 		kmdp = preload_search_by_type("elf kernel");
 		if (kmdp != NULL) {
 			boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 			init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *),
 			    0);
 			end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
 			kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS,
 			    int);
 			kernel_tlbs = (void *)preload_search_info(kmdp,
 			    MODINFO_METADATA | MODINFOMD_DTLB);
 		}
 	}
 
 	init_param1();
 
 	/*
 	 * Initialize Open Firmware (needed for console).
 	 */
 	OF_install(OFW_STD_DIRECT, 0);
 	OF_init(ofw_entry);
 
 	/*
 	 * Prime our per-CPU data page for use.  Note, we are using it for
 	 * our stack, so don't pass the real size (PAGE_SIZE) to pcpu_init
 	 * or it'll zero it out from under us.
 	 */
 	pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	pc->pc_addr = (vm_offset_t)pcpu0;
 	pc->pc_impl = cpu_impl;
 	pc->pc_mid = cpu_get_mid(cpu_impl);
 	pc->pc_tlb_ctx = TLB_CTX_USER_MIN;
 	pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN;
 	pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX;
 
 	/*
 	 * Determine the OFW node and frequency of the BSP (and ensure the
 	 * BSP is in the device tree in the first place).
 	 */
 	root = OF_peer(0);
 	pc->pc_node = find_bsp(root, pc->pc_mid, cpu_impl);
 	if (pc->pc_node == 0)
 		OF_panic("%s: cannot find boot CPU node", __func__);
 	if (OF_getprop(pc->pc_node, "clock-frequency", &pc->pc_clock,
 	    sizeof(pc->pc_clock)) <= 0)
 		OF_panic("%s: cannot determine boot CPU clock", __func__);
 
 	/*
 	 * Panic if there is no metadata.  Most likely the kernel was booted
 	 * directly, instead of through loader(8).
 	 */
 	if (mdp == NULL || kmdp == NULL || end == 0 ||
 	    kernel_tlb_slots == 0 || kernel_tlbs == NULL)
 		OF_panic("%s: missing loader metadata.\nThis probably means "
 		    "you are not using loader(8).", __func__);
 
 	/*
 	 * Work around the broken loader behavior of not demapping no
 	 * longer used kernel TLB slots when unloading the kernel or
 	 * modules.
 	 */
 	for (va = KERNBASE + (kernel_tlb_slots - 1) * PAGE_SIZE_4M;
 	    va >= roundup2(end, PAGE_SIZE_4M); va -= PAGE_SIZE_4M) {
 		if (bootverbose)
 			OF_printf("demapping unused kernel TLB slot "
 			    "(va %#lx - %#lx)\n", va, va + PAGE_SIZE_4M - 1);
 		stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE,
 		    ASI_DMMU_DEMAP, 0);
 		stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE,
 		    ASI_IMMU_DEMAP, 0);
 		flush(KERNBASE);
 		kernel_tlb_slots--;
 	}
 
 	/*
 	 * Determine the TLB slot maxima, which are expected to be
 	 * equal across all CPUs.
 	 * NB: for cheetah-class CPUs, these properties only refer
 	 * to the t16s.
 	 */
 	if (OF_getprop(pc->pc_node, "#dtlb-entries", &dtlb_slots,
 	    sizeof(dtlb_slots)) == -1)
 		OF_panic("%s: cannot determine number of dTLB slots",
 		    __func__);
 	if (OF_getprop(pc->pc_node, "#itlb-entries", &itlb_slots,
 	    sizeof(itlb_slots)) == -1)
 		OF_panic("%s: cannot determine number of iTLB slots",
 		    __func__);
 
 	/*
 	 * Initialize and enable the caches.  Note that this may include
 	 * applying workarounds.
 	 */
 	cache_init(pc);
 	cache_enable(cpu_impl);
 	uma_set_align(pc->pc_cache.dc_linesize - 1);
 
 	/*
 	 * Default to the generic block routines; they are only replaced
 	 * below when machdep.use_vis is set and the CPU is listed.
 	 */
 	cpu_block_copy = bcopy;
 	cpu_block_zero = bzero;
 	getenv_int("machdep.use_vis", &cpu_use_vis);
 	if (cpu_use_vis) {
 		switch (cpu_impl) {
 		case CPU_IMPL_SPARC64:
 		case CPU_IMPL_ULTRASPARCI:
 		case CPU_IMPL_ULTRASPARCII:
 		case CPU_IMPL_ULTRASPARCIIi:
 		case CPU_IMPL_ULTRASPARCIIe:
 		case CPU_IMPL_ULTRASPARCIII:	/* NB: we've disabled P$. */
 		case CPU_IMPL_ULTRASPARCIIIp:
 		case CPU_IMPL_ULTRASPARCIIIi:
 		case CPU_IMPL_ULTRASPARCIV:
 		case CPU_IMPL_ULTRASPARCIVp:
 		case CPU_IMPL_ULTRASPARCIIIip:
 			cpu_block_copy = spitfire_block_copy;
 			cpu_block_zero = spitfire_block_zero;
 			break;
 		case CPU_IMPL_SPARC64V:
 			cpu_block_copy = zeus_block_copy;
 			cpu_block_zero = zeus_block_zero;
 			break;
 		}
 	}
 
 #ifdef SMP
 	mp_init();
 #endif
 
 	/*
 	 * Initialize virtual memory and calculate physmem.
 	 */
 	pmap_bootstrap(cpu_impl);
 
 	/*
 	 * Initialize tunables.
 	 */
 	init_param2(physmem);
 	env = kern_getenv("kernelname");
 	if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	/*
 	 * Initialize the interrupt tables.
 	 */
 	intr_init1();
 
 	/*
 	 * Initialize proc0, set kstack0, frame0, curthread and curpcb.
 	 */
 	proc_linkup0(&proc0, &thread0);
 	proc0.p_md.md_sigtramp = NULL;
 	proc0.p_md.md_utrap = NULL;
 	thread0.td_kstack = kstack0;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 	thread0.td_pcb = (struct pcb *)
 	    (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV;
 	thread0.td_frame = &frame0;
 	pc->pc_curthread = &thread0;
 	pc->pc_curpcb = thread0.td_pcb;
 
 	/*
 	 * Initialize global registers.
 	 */
 	cpu_setregs(pc);
 
 	/*
 	 * Take over the trap table via the PROM.  Using the PROM for this
 	 * is necessary in order to set obp-control-relinquished to true
 	 * within the PROM so obtaining /virtual-memory/translations doesn't
 	 * trigger a fatal reset error or worse things further down the road.
 	 * XXX it should be possible to use this solely instead of writing
 	 * %tba in cpu_setregs().  Doing so causes a hang however.
 	 *
 	 * NB: the low-level console drivers require a working DELAY() and
 	 * some compiler optimizations may cause the curthread accesses of
 	 * mutex(9) to be factored out even if the latter aren't actually
 	 * called.  Both of these require PCPU_REG to be set.  However, we
 	 * can't set PCPU_REG without also taking over the trap table or the
 	 * firmware will overwrite it.
 	 */
 	sun4u_set_traptable(tl0_base);
 
 	/*
 	 * Initialize the dynamic per-CPU area for the BSP and the message
 	 * buffer (after setting the trap table).
 	 */
 	dpcpu_init(dpcpu0, 0);
 	msgbufinit(msgbufp, msgbufsize);
 
 	/*
 	 * Initialize mutexes.
 	 */
 	mutex_init();
 
 	/*
 	 * Initialize console now that we have a reasonable set of system
 	 * services.
 	 */
 	cninit();
 
 	/*
 	 * Finish the interrupt initialization now that mutexes work and
 	 * enable them.
 	 */
 	intr_init2();
 	wrpr(pil, 0, 0);
 	wrpr(pstate, 0, PSTATE_KERNEL);
 
 	/* The return value is not checked; sparc64_model is left as is on failure. */
 	OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1);
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 #endif
 }
 
 /*
  * Arrange for the current thread to run the signal handler `catcher' for
  * the signal described by ksi.
  *
  * A struct sigframe holding the saved user context (and, for SA_SIGINFO
  * handlers, the siginfo) is copied out to the user stack or to the
  * alternate signal stack, the handler arguments are placed in the out
  * registers of the trapframe, and the trapframe is redirected to the
  * process' signal trampoline.
  *
  * Must be called with the process lock and the sigacts mutex held (both
  * are asserted).  They are dropped around the copyout and re-acquired
  * before returning.  If no trampoline is registered, or the frame cannot
  * be written, the process is terminated through sigexit().
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct trapframe *tf;
 	struct sigframe *sfp;
 	struct sigacts *psp;
 	struct sigframe sf;
 	struct thread *td;
 	struct frame *fp;
 	struct proc *p;
 	u_long sp;
 	int oonstack;
 	int siginfo;
 	int sig;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	sp = tf->tf_sp + SPOFF;
 	oonstack = sigonstack(sp);
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	    catcher, sig);
 
 	/* Make sure we have a signal trampoline to return to. */
 	if (p->p_md.md_sigtramp == NULL) {
 		/*
 		 * No signal trampoline... kill the process.
 		 */
 		CTR0(KTR_SIG, "sendsig: no sigtramp");
 		printf("sendsig: %s is too old, rebuild it\n", p->p_comm);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
 	    ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 	/* Allocate and validate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe));
 	} else
 		sfp = (struct sigframe *)sp - 1;
 
 	/*
 	 * Sample the SA_SIGINFO disposition while ps_mtx is still held;
 	 * the sigacts must not be inspected once the lock is dropped.
 	 */
 	siginfo = SIGISMEMBER(psp->ps_siginfo, sig) ? 1 : 0;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Reserve a register window save area below the sigframe. */
 	fp = (struct frame *)sfp - 1;
 
 	/* Build the argument list for the signal handler. */
 	tf->tf_out[0] = sig;
 	tf->tf_out[2] = (register_t)&sfp->sf_uc;
 	tf->tf_out[4] = (register_t)catcher;
 	if (siginfo) {
 		/* Signal handler installed with SA_SIGINFO. */
 		tf->tf_out[1] = (register_t)&sfp->sf_si;
 
 		/* Fill in POSIX parts. */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		tf->tf_out[1] = ksi->ksi_code;
 		tf->tf_out[3] = (register_t)ksi->ksi_addr;
 	}
 
 	/* Copy the sigframe out to the user's stack. */
 	if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    suword(&fp->fr_in[6], tf->tf_out[6]) != 0) {
 		/*
 		 * Something is wrong with the stack pointer.
 		 * ...Kill the process.
 		 */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 		/* NOTREACHED */
 	}
 
 	/* Resume in the trampoline, running on the newly built frame. */
 	tf->tf_tpc = (u_long)p->p_md.md_sigtramp;
 	tf->tf_tnpc = tf->tf_tpc + 4;
 	tf->tf_sp = (u_long)fp - SPOFF;
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc,
 	    tf->tf_sp);
 
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 #ifndef	_SYS_SYSPROTO_H_
 /*
  * Fallback argument layout for sigreturn(2), used only when the
  * generated system call prototypes are not in scope.  The member is
  * named sigcntxp to match what sys_sigreturn() dereferences.
  */
 struct sigreturn_args {
 	const ucontext_t *sigcntxp;
 };
 #endif
 
 /*
  * sigreturn(2) handler: reload the thread's machine context and signal
  * mask from the ucontext that uap->sigcntxp points to.
  *
  * Returns EFAULT when the context cannot be copied in, the error from
  * set_mcontext() when it rejects the context, and EJUSTRETURN otherwise.
  * The thread is terminated with SIGILL when rwindow_save() returns
  * non-zero.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	ucontext_t ctx;
 	mcontext_t *mctx;
 	struct proc *proc;
 	int rv;
 
 	proc = td->td_proc;
 	if (rwindow_save(td) != 0) {
 		PROC_LOCK(proc);
 		sigexit(td, SIGILL);
 	}
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 	if (copyin(uap->sigcntxp, &ctx, sizeof(ctx)) != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	mctx = &ctx.uc_mcontext;
 	rv = set_mcontext(td, mctx);
 	if (rv == 0) {
 		/* Context accepted; now install the saved signal mask. */
 		kern_sigprocmask(td, SIG_SETMASK, &ctx.uc_sigmask, NULL, 0);
 
 		CTR4(KTR_SIG,
 		    "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx",
 		    td, mctx->_mc_tpc, mctx->_mc_sp, mctx->_mc_tstate);
 		rv = EJUSTRETURN;
 	}
 	return (rv);
 }
 
 /*
  * Derive a PCB from a trapframe.  kdb_trap() uses this so that a
  * backtrace can begin at the function that entered the debugger: the
  * context lives in the trapframe, while the trace is driven by the PCB.
  * Only the stack pointer and program counter are filled in, which is
  * all a backtrace needs.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_sp = tf->tf_sp;
 	pcb->pcb_pc = tf->tf_tpc;
 }
 
 /*
  * Capture the user register state of td into *mc.
  *
  * With GET_MC_CLEAR_RET set in flags, %o0 and %o1 are reported as zero
  * instead of their trapframe values.  Floating point state is included
  * (and FPRS_FEF set in the saved %fprs) when the PCB holds it.
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mc, int flags)
 {
 	struct trapframe *frame;
 	struct pcb *upcb;
 	int clear_ret;
 	int r;
 
 	frame = td->td_frame;
 	upcb = td->td_pcb;
 	clear_ret = (flags & GET_MC_CLEAR_RET) != 0;
 
 	/*
 	 * Take the registers tl0_ret() restores from the trapframe.
 	 * %g7 (userland TLS register) and %wstate are deliberately left out.
 	 */
 	mc->_mc_flags = _MC_VERSION;
 	for (r = 1; r <= 6; r++)
 		mc->mc_global[r] = frame->tf_global[r];
 	mc->mc_out[0] = clear_ret ? 0 : frame->tf_out[0];
 	mc->mc_out[1] = clear_ret ? 0 : frame->tf_out[1];
 	for (r = 2; r <= 7; r++)
 		mc->mc_out[r] = frame->tf_out[r];
 	mc->_mc_fprs = frame->tf_fprs;
 	mc->_mc_fsr = frame->tf_fsr;
 	mc->_mc_gsr = frame->tf_gsr;
 	mc->_mc_tnpc = frame->tf_tnpc;
 	mc->_mc_tpc = frame->tf_tpc;
 	mc->_mc_tstate = frame->tf_tstate;
 	mc->_mc_y = frame->tf_y;
 
 	critical_enter();
 	/* Live FP state is first saved into the PCB... */
 	if ((frame->tf_fprs & FPRS_FEF) != 0) {
 		savefpctx(upcb->pcb_ufp);
 		frame->tf_fprs &= ~FPRS_FEF;
 		upcb->pcb_flags |= PCB_FEF;
 	}
 	/* ...and then copied out of the PCB when present there. */
 	if ((upcb->pcb_flags & PCB_FEF) != 0) {
 		bcopy(upcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp));
 		mc->_mc_fprs |= FPRS_FEF;
 	}
 	critical_exit();
 	return (0);
 }
 
 /*
  * Install the register state in *mc as the user context of td.
  *
  * Returns EINVAL when the supplied %tstate fails TSTATE_SECURE() or the
  * version bits of the context flags are not _MC_VERSION; returns 0 once
  * the trapframe (and, if FPRS_FEF is set, the PCB floating point area)
  * has been updated.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mc)
 {
 	struct trapframe *frame;
 	struct pcb *upcb;
 	int r;
 
 	if (!TSTATE_SECURE(mc->_mc_tstate))
 		return (EINVAL);
 	if ((mc->_mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION)
 		return (EINVAL);
 
 	frame = td->td_frame;
 	upcb = td->td_pcb;
 
 	/* Make sure the windows are spilled first. */
 	flushw();
 
 	/*
 	 * Hand the registers tl0_ret() restores over to the trapframe.
 	 * %g7 (userland TLS register) and %wstate are deliberately left out.
 	 */
 	for (r = 1; r <= 6; r++)
 		frame->tf_global[r] = mc->mc_global[r];
 	for (r = 0; r <= 7; r++)
 		frame->tf_out[r] = mc->mc_out[r];
 	frame->tf_fprs = mc->_mc_fprs;
 	frame->tf_fsr = mc->_mc_fsr;
 	frame->tf_gsr = mc->_mc_gsr;
 	frame->tf_tnpc = mc->_mc_tnpc;
 	frame->tf_tpc = mc->_mc_tpc;
 	frame->tf_tstate = mc->_mc_tstate;
 	frame->tf_y = mc->_mc_y;
 
 	/* FP state goes to the PCB; the trapframe %fprs is cleared. */
 	if ((mc->_mc_fprs & FPRS_FEF) != 0) {
 		frame->tf_fprs = 0;
 		bcopy(mc->mc_fp, upcb->pcb_ufp, sizeof(upcb->pcb_ufp));
 		upcb->pcb_flags |= PCB_FEF;
 	}
 	return (0);
 }
 
 /*
  * Exit the kernel and execute a firmware call that will not return, as
  * specified by the arguments.
  *
  * On SMP kernels cpu_mp_shutdown() is called first; args is then passed
  * unchanged to ofw_exit().
  */
 void
 cpu_shutdown(void *args)
 {
 
 #ifdef SMP
 	cpu_mp_shutdown();
 #endif
 	ofw_exit(args);
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * Not implemented yet: the body is empty and both arguments are ignored.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 
 	/* TBD */
 }
 
 /*
  * Store the clock frequency recorded in the per-CPU data of cpu_id
  * into *rate.  Returns 0 on success, or EINVAL when no per-CPU data
  * exists for cpu_id or rate is NULL.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	struct pcpu *cpu;
 
 	cpu = pcpu_find(cpu_id);
 	if (cpu != NULL && rate != NULL) {
 		*rate = cpu->pc_clock;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Equivalent of OF_exit(), but issued through cpu_shutdown() so that the
  * firmware call restores the trap table first; without that at least
  * some firmware versions raise a RED state exception.
  */
 void
 cpu_halt(void)
 {
 	/* Firmware "exit" request: no arguments, no return values. */
 	static struct {
 		cell_t name;
 		cell_t nargs;
 		cell_t nreturns;
 	} exit_args = {
 		.name = (cell_t)"exit",
 		.nargs = 0,
 		.nreturns = 0
 	};
 
 	cpu_shutdown(&exit_args);
 }
 
 /*
  * Final shutdown step: request a power-off from the firmware when howto
  * has RB_POWEROFF set, and return to the firmware when it has RB_HALT
  * set.  The first argument is unused.
  */
 static void
 sparc64_shutdown_final(void *dummy, int howto)
 {
 	/* Firmware "SUNW,power-off" request: no arguments, no returns. */
 	static struct {
 		cell_t name;
 		cell_t nargs;
 		cell_t nreturns;
 	} poweroff_args = {
 		.name = (cell_t)"SUNW,power-off",
 		.nargs = 0,
 		.nreturns = 0
 	};
 
 	if (howto & RB_POWEROFF)
 		cpu_shutdown(&poweroff_args);
 	if (howto & RB_HALT)
 		cpu_halt();
 }
 
 /*
  * Idle loop hook.  Currently a no-op: nothing is halted and the busy
  * argument is ignored.
  */
 void
 cpu_idle(int busy)
 {
 
 	/* Insert code to halt (until next interrupt) for the idle loop. */
 }
 
 /*
  * Idle wakeup hook.  Unconditionally returns 1; the cpu argument is
  * not used.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 
 	return (1);
 }
 
 /*
  * Point the traced thread at addr: %tpc becomes addr and %tnpc the
  * instruction following it (addr + 4).  Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	frame->tf_tpc = addr;
 	frame->tf_tnpc = addr + 4;
 	return (0);
 }
 
 /*
  * Enable single-stepping for td.  Not implemented: does nothing and
  * reports success (0).
  */
 int
 ptrace_single_step(struct thread *td)
 {
 
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Disable single-stepping for td.  Not implemented: does nothing and
  * reports success (0).
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Reset the machine-dependent state of td for a newly exec'ed image.
  *
  * The signal trampoline pointer is cleared, any user trap table is
  * released, and both the PCB and the trapframe are zeroed before the
  * initial registers are set: %o0 receives the stack argument, %o3 the
  * ps_strings address of the process' sysentvec, %o6 the stack pointer
  * derived from the 16-byte-aligned stack, and %tpc/%tnpc the image
  * entry point.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 	struct proc *p;
 	u_long sp;
 
 	/* XXX no cpu_exec */
 	p = td->td_proc;
 	p->p_md.md_sigtramp = NULL;
 	if (p->p_md.md_utrap != NULL) {
 		utrap_free(p->p_md.md_utrap);
 		p->p_md.md_utrap = NULL;
 	}
 
 	pcb = td->td_pcb;
 	tf = td->td_frame;
 	sp = rounddown(stack, 16);
 	bzero(pcb, sizeof(*pcb));
 	bzero(tf, sizeof(*tf));
 	tf->tf_out[0] = stack;
 	tf->tf_out[3] = p->p_sysent->sv_psstrings;
 	/* Leave room for one struct frame and apply the stack bias. */
 	tf->tf_out[6] = sp - SPOFF - sizeof(struct frame);
 	tf->tf_tnpc = imgp->entry_addr + 4;
 	tf->tf_tpc = imgp->entry_addr;
 	/*
 	 * While we could adhere to the memory model indicated in the ELF
 	 * header, it turns out that just always using TSO performs best.
 	 */
 	tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO;
-
-	td->td_retval[0] = tf->tf_out[0];
-	td->td_retval[1] = tf->tf_out[1];
 }
 
 /*
  * Copy the general register state of td into *regs.  The first
  * sizeof(struct reg) bytes of the trapframe are copied verbatim.
  * Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	bcopy(td->td_frame, regs, sizeof(*regs));
 	return (0);
 }
 
 /*
  * Load the general registers of td from *regs.
  *
  * The request is refused with EINVAL when regs->r_tstate fails
  * TSTATE_SECURE().  The current %wstate of the thread is written back
  * into *regs before the copy, so that field is preserved.  Returns 0 on
  * success.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	if (TSTATE_SECURE(regs->r_tstate)) {
 		frame = td->td_frame;
 		regs->r_wstate = frame->tf_wstate;
 		bcopy(regs, frame, sizeof(*regs));
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Debug register read.  Not supported here: always returns ENOSYS.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Debug register write.  Not supported here: always returns ENOSYS.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Export the floating point state of td: the register file comes from
  * the PCB save area, %fsr and %gsr from the trapframe.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *frame = td->td_frame;
 	struct pcb *upcb = td->td_pcb;
 
 	fpregs->fr_fsr = frame->tf_fsr;
 	fpregs->fr_gsr = frame->tf_gsr;
 	bcopy(upcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs));
 	return (0);
 }
 
 /*
  * Import floating point state for td: the register file is written to
  * the PCB save area, %fsr and %gsr to the trapframe, and FPRS_FEF is
  * cleared in the trapframe %fprs.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *frame = td->td_frame;
 	struct pcb *upcb = td->td_pcb;
 
 	bcopy(fpregs->fr_regs, upcb->pcb_ufp, sizeof(upcb->pcb_ufp));
 	frame->tf_fsr = fpregs->fr_fsr;
 	frame->tf_gsr = fpregs->fr_gsr;
 	frame->tf_fprs &= ~FPRS_FEF;
 	return (0);
 }
 
 /*
  * Allocate a zero-filled user trap table carrying a single reference.
  * The allocation uses M_WAITOK.
  */
 struct md_utrap *
 utrap_alloc(void)
 {
 	struct md_utrap *fresh;
 
 	fresh = malloc(sizeof(*fresh), M_SUBPROC, M_WAITOK | M_ZERO);
 	fresh->ut_refcnt = 1;
 	return (fresh);
 }
 
 /*
  * Drop one reference on ut and release it once the count reaches zero.
  * A NULL ut is ignored.  The count is updated under the pool mutex
  * associated with ut.
  */
 void
 utrap_free(struct md_utrap *ut)
 {
 	int remaining;
 
 	if (ut != NULL) {
 		mtx_pool_lock(mtxpool_sleep, ut);
 		remaining = --ut->ut_refcnt;
 		mtx_pool_unlock(mtxpool_sleep, ut);
 		if (remaining == 0)
 			free(ut, M_SUBPROC);
 	}
 }
 
 /*
  * Take an additional reference on ut and hand the same pointer back.
  * A NULL ut yields NULL.  The count is updated under the pool mutex
  * associated with ut.
  */
 struct md_utrap *
 utrap_hold(struct md_utrap *ut)
 {
 
 	if (ut != NULL) {
 		mtx_pool_lock(mtxpool_sleep, ut);
 		ut->ut_refcnt += 1;
 		mtx_pool_unlock(mtxpool_sleep, ut);
 	}
 	return (ut);
 }
Index: projects/bsd_rdma_4_9/usr.bin/vmstat/vmstat.c
===================================================================
--- projects/bsd_rdma_4_9/usr.bin/vmstat/vmstat.c	(revision 326161)
+++ projects/bsd_rdma_4_9/usr.bin/vmstat/vmstat.c	(revision 326162)
@@ -1,1738 +1,1738 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1980, 1986, 1991, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)vmstat.c	8.1 (Berkeley) 6/6/93";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/signal.h>
 #include <sys/fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/user.h>
 #define	_WANT_VMMETER
 #include <sys/vmmeter.h>
 #include <sys/pcpu.h>
 
 #include <vm/vm_param.h>
 
 #include <ctype.h>
 #include <devstat.h>
 #include <err.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <kvm.h>
 #include <limits.h>
 #include <memstat.h>
 #include <nlist.h>
 #include <paths.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sysexits.h>
 #include <time.h>
 #include <unistd.h>
 #include <libutil.h>
 #include <libxo/xo.h>
 
 #define VMSTAT_XO_VERSION "1"
 
 static char da[] = "da";
 
 /*
  * Kernel symbols resolved with kvm_nlist() in main() when a kvm handle
  * is open.  Each X_* constant is the array index of the entry that
  * follows it, so the two must be kept in step.  The list ends with an
  * empty-name entry.
  */
 static struct nlist namelist[] = {
 #define X_SUM		0
 	{ "_vm_cnt" },	/* main() retries this slot as "_cnt" if unresolved */
 #define X_HZ		1
 	{ "_hz" },
 #define X_STATHZ	2
 	{ "_stathz" },
 #define X_NCHSTATS	3
 	{ "_nchstats" },
 #define	X_INTRNAMES	4
 	{ "_intrnames" },
 #define	X_SINTRNAMES	5
 	{ "_sintrnames" },
 #define	X_INTRCNT	6
 	{ "_intrcnt" },
 #define	X_SINTRCNT	7
 	{ "_sintrcnt" },
 #ifdef notyet
 	/* Placeholder indices (XXX) for symbols that are not wired up yet. */
 #define	X_DEFICIT	XXX
 	{ "_deficit" },
 #define X_REC		XXX
 	{ "_rectime" },
 #define X_PGIN		XXX
 	{ "_pgintime" },
 #define	X_XSTATS	XXX
 	{ "_xstats" },
 #define X_END		XXX
 #else
 #define X_END		8
 #endif
 	{ "" },
 };
 
 static struct statinfo cur, last;
 static int num_devices, maxshowdevs;
 static long generation;
 static struct device_selection *dev_select;
 static int num_selected;
 static struct devstat_match *matches;
 static int num_matches = 0;
 static int num_devices_specified, num_selections;
 static long select_generation;
 static char **specified_devices;
 static devstat_select_mode select_mode;
 
 /*
  * Private snapshot of the VM statistics, populated by fill_vmmeter().
  * The uint64_t members are event counters; the u_int members are the
  * page size and page-count style values.  Two instances are kept:
  * sum and osum.
  */
 static struct __vmmeter {
 	uint64_t v_swtch;
 	uint64_t v_trap;
 	uint64_t v_syscall;
 	uint64_t v_intr;
 	uint64_t v_soft;
 	uint64_t v_vm_faults;
 	uint64_t v_io_faults;
 	uint64_t v_cow_faults;
 	uint64_t v_cow_optim;
 	uint64_t v_zfod;
 	uint64_t v_ozfod;
 	uint64_t v_swapin;
 	uint64_t v_swapout;
 	uint64_t v_swappgsin;
 	uint64_t v_swappgsout;
 	uint64_t v_vnodein;
 	uint64_t v_vnodeout;
 	uint64_t v_vnodepgsin;
 	uint64_t v_vnodepgsout;
 	uint64_t v_intrans;
 	uint64_t v_reactivated;
 	uint64_t v_pdwakeups;
 	uint64_t v_pdpages;
 	uint64_t v_pdshortfalls;
 	uint64_t v_dfree;
 	uint64_t v_pfree;
 	uint64_t v_tfree;
 	uint64_t v_forks;
 	uint64_t v_vforks;
 	uint64_t v_rforks;
 	uint64_t v_kthreads;
 	uint64_t v_forkpages;
 	uint64_t v_vforkpages;
 	uint64_t v_rforkpages;
 	uint64_t v_kthreadpages;
 	u_int v_page_size;
 	u_int v_page_count;
 	u_int v_free_reserved;
 	u_int v_free_target;
 	u_int v_free_min;
 	u_int v_free_count;
 	u_int v_wire_count;
 	u_int v_active_count;
 	u_int v_inactive_target;
 	u_int v_inactive_count;
 	u_int v_laundry_count;
 	u_int v_pageout_free_min;
 	u_int v_interrupt_free_min;
 	u_int v_free_severe;	/* not fetched by fill_vmmeter() at present */
 } sum, osum;
 
 #define	VMSTAT_DEFAULT_LINES	20	/* Default number of `winlines'. */
 volatile sig_atomic_t wresized;		/* Tty resized, when non-zero. */
 static int winlines = VMSTAT_DEFAULT_LINES; /* Current number of tty rows. */
 
 static int	aflag;
 static int	nflag;
 static int	Pflag;
 static int	hflag;
 
 static kvm_t   *kd;
 
 #define	FORKSTAT	0x01
 #define	INTRSTAT	0x02
 #define	MEMSTAT		0x04
 #define	SUMSTAT		0x08
 #define	TIMESTAT	0x10
 #define	VMSTAT		0x20
 #define ZMEMSTAT	0x40
 #define	OBJSTAT		0x80
 
 static void	cpustats(void);
 static void	pcpustats(int, u_long, int);
 static void	devstats(void);
 static void	doforkst(void);
 static void	dointr(unsigned int, int);
 static void	doobjstat(void);
 static void	dosum(void);
 static void	dovmstat(unsigned int, int);
 static void	domemstat_malloc(void);
 static void	domemstat_zone(void);
 static void	kread(int, void *, size_t);
 static void	kreado(int, void *, size_t, size_t);
 static char    *kgetstr(const char *);
 static void	needhdr(int);
 static void	needresize(int);
 static void	doresize(void);
 static void	printhdr(int, u_long);
 static void	usage(void);
 
 static long	pct(long, long);
 static long long	getuptime(void);
 
 static char   **getdrivedata(char **);
 
 int
 main(int argc, char *argv[])
 {
 	int c, todo;
 	unsigned int interval;
 	float f;
 	int reps;
 	char *memf, *nlistf;
 	char errbuf[_POSIX2_LINE_MAX];
 
 	memf = nlistf = NULL;
 	interval = reps = todo = 0;
 	maxshowdevs = 2;
 	hflag = isatty(1);
 
 	argc = xo_parse_args(argc, argv);
 	if (argc < 0)
 		return argc;
 
 	while ((c = getopt(argc, argv, "ac:fhHiM:mN:n:oPp:stw:z")) != -1) {
 		switch (c) {
 		case 'a':
 			aflag++;
 			break;
 		case 'c':
 			reps = atoi(optarg);
 			break;
 		case 'P':
 			Pflag++;
 			break;
 		case 'f':
 			todo |= FORKSTAT;
 			break;
 		case 'h':
 			hflag = 1;
 			break;
 		case 'H':
 			hflag = 0;
 			break;
 		case 'i':
 			todo |= INTRSTAT;
 			break;
 		case 'M':
 			memf = optarg;
 			break;
 		case 'm':
 			todo |= MEMSTAT;
 			break;
 		case 'N':
 			nlistf = optarg;
 			break;
 		case 'n':
 			nflag = 1;
 			maxshowdevs = atoi(optarg);
 			if (maxshowdevs < 0)
 				xo_errx(1, "number of devices %d is < 0",
 				     maxshowdevs);
 			break;
 		case 'o':
 			todo |= OBJSTAT;
 			break;
 		case 'p':
 			if (devstat_buildmatch(optarg, &matches, &num_matches) != 0)
 				xo_errx(1, "%s", devstat_errbuf);
 			break;
 		case 's':
 			todo |= SUMSTAT;
 			break;
 		case 't':
 #ifdef notyet
 			todo |= TIMESTAT;
 #else
 			xo_errx(EX_USAGE, "sorry, -t is not (re)implemented yet");
 #endif
 			break;
 		case 'w':
 			/* Convert to milliseconds. */
 			f = atof(optarg);
 			interval = f * 1000;
 			break;
 		case 'z':
 			todo |= ZMEMSTAT;
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 	xo_set_version(VMSTAT_XO_VERSION);
 	if (todo == 0)
 		todo = VMSTAT;
 
 	if (memf != NULL) {
 		kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf);
 		if (kd == NULL)
 			xo_errx(1, "kvm_openfiles: %s", errbuf);
 	}
 
 retry_nlist:
 	if (kd != NULL && (c = kvm_nlist(kd, namelist)) != 0) {
 		if (c > 0) {
 			int bufsize = 0, len = 0;
 			char *buf, *bp;
 			/*
 			 * 'cnt' was renamed to 'vm_cnt'. If 'vm_cnt' is not
 			 * found try looking up older 'cnt' symbol.
 			 * */
 			if (namelist[X_SUM].n_type == 0 &&
 			    strcmp(namelist[X_SUM].n_name, "_vm_cnt") == 0) {
 				namelist[X_SUM].n_name = "_cnt";
 				goto retry_nlist;
 			}
 			for (c = 0; c < (int)(nitems(namelist)); c++)
 				if (namelist[c].n_type == 0)
 					bufsize += strlen(namelist[c].n_name) + 1;
 			bufsize += len + 1;
 			buf = bp = alloca(bufsize);
 
 			for (c = 0; c < (int)(nitems(namelist)); c++)
 				if (namelist[c].n_type == 0) {
 					xo_error(" %s",
 					    namelist[c].n_name);
 					len = strlen(namelist[c].n_name);
 					*bp++ = ' ';
 					memcpy(bp, namelist[c].n_name, len);
 					bp += len;
 				}
 			*bp = '\0';
 			xo_error("undefined symbols:\n", buf);
 		} else
 			xo_warnx("kvm_nlist: %s", kvm_geterr(kd));
 		xo_finish();
 		exit(1);
 	}
 	if (kd && Pflag)
 		xo_errx(1, "Cannot use -P with crash dumps");
 
 	if (todo & VMSTAT) {
 		/*
 		 * Make sure that the userland devstat version matches the
 		 * kernel devstat version.  If not, exit and print a
 		 * message informing the user of his mistake.
 		 */
 		if (devstat_checkversion(NULL) < 0)
 			xo_errx(1, "%s", devstat_errbuf);
 
 
 		argv = getdrivedata(argv);
 	}
 
 	if (*argv) {
 		f = atof(*argv);
 		interval = f * 1000;
 		if (*++argv)
 			reps = atoi(*argv);
 	}
 
 	if (interval) {
 		if (!reps)
 			reps = -1;
 	} else if (reps)
 		interval = 1 * 1000;
 
 	if (todo & FORKSTAT)
 		doforkst();
 	if (todo & MEMSTAT)
 		domemstat_malloc();
 	if (todo & ZMEMSTAT)
 		domemstat_zone();
 	if (todo & SUMSTAT)
 		dosum();
 	if (todo & OBJSTAT)
 		doobjstat();
 #ifdef notyet
 	if (todo & TIMESTAT)
 		dotimes();
 #endif
 	if (todo & INTRSTAT)
 		dointr(interval, reps);
 	if (todo & VMSTAT)
 		dovmstat(interval, reps);
 	xo_finish();
 	exit(0);
 }
 
 /*
  * sysctlbyname() wrapper for read-only queries.  Any failure other
  * than ENOMEM terminates the program through xo_err(); an ENOMEM
  * failure is handed back to the caller.  Returns the sysctlbyname()
  * result.
  */
 static int
 mysysctl(const char *name, void *oldp, size_t *oldlenp)
 {
 	int rc;
 
 	rc = sysctlbyname(name, oldp, oldlenp, NULL, 0);
 	if (rc == 0)
 		return (rc);
 	if (errno != ENOMEM)
 		xo_err(1, "sysctl(%s)", name);
 	return (rc);
 }
 
 /*
  * Set up the devstat state used by the VM report and select the devices
  * to display.
  *
  * Leading operands in argv that do not start with a digit are taken as
  * device names.  Returns argv advanced past those names, i.e. pointing
  * at the first operand that starts with a digit or at the terminating
  * NULL.  Any devstat or allocation failure terminates the program.
  */
 static char **
 getdrivedata(char **argv)
 {
 	char **grown;
 
 	if ((num_devices = devstat_getnumdevs(NULL)) < 0)
 		xo_errx(1, "%s", devstat_errbuf);
 
 	cur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
 	last.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
 	if (cur.dinfo == NULL || last.dinfo == NULL)
 		xo_err(1, "calloc");
 
 	if (devstat_getdevs(NULL, &cur) == -1)
 		xo_errx(1, "%s", devstat_errbuf);
 
 	num_devices = cur.dinfo->numdevs;
 	generation = cur.dinfo->generation;
 
 	specified_devices = (char **)malloc(sizeof(char *));
 	if (specified_devices == NULL)
 		xo_err(1, "malloc");
 	for (num_devices_specified = 0; *argv; ++argv) {
 		/* <ctype.h> functions need an unsigned char value. */
 		if (isdigit((unsigned char)**argv))
 			break;
 		num_devices_specified++;
 		/* Keep the old array reachable until realloc() succeeds. */
 		grown = (char **)realloc(specified_devices,
 					 sizeof(char *) *
 					 num_devices_specified);
 		if (grown == NULL)
 			xo_err(1, "realloc");
 		specified_devices = grown;
 		specified_devices[num_devices_specified - 1] = *argv;
 	}
 	dev_select = NULL;
 
 	/* Without -n, make room for every explicitly named device. */
 	if (nflag == 0 && maxshowdevs < num_devices_specified)
 			maxshowdevs = num_devices_specified;
 
 	/*
 	 * People are generally only interested in disk statistics when
 	 * they're running vmstat.  So, that's what we're going to give
 	 * them if they don't specify anything by default.  We'll also give
 	 * them any other random devices in the system so that we get to
 	 * maxshowdevs devices, if that many devices exist.  If the user
 	 * specifies devices on the command line, either through a pattern
 	 * match or by naming them explicitly, we will give the user only
 	 * those devices.
 	 */
 	if ((num_devices_specified == 0) && (num_matches == 0)) {
 		if (devstat_buildmatch(da, &matches, &num_matches) != 0)
 			xo_errx(1, "%s", devstat_errbuf);
 
 		select_mode = DS_SELECT_ADD;
 	} else
 		select_mode = DS_SELECT_ONLY;
 
 	/*
 	 * At this point, selectdevs will almost surely indicate that the
 	 * device list has changed, so we don't look for return values of 0
 	 * or 1.  If we get back -1, though, there is an error.
 	 */
 	if (devstat_selectdevs(&dev_select, &num_selected, &num_selections,
 		       &select_generation, generation, cur.dinfo->devices,
 		       num_devices, matches, num_matches, specified_devices,
 		       num_devices_specified, select_mode,
 		       maxshowdevs, 0) == -1)
 		xo_errx(1, "%s", devstat_errbuf);
 
 	return(argv);
 }
 
 /*
  * Read CLOCK_UPTIME and return it expressed in nanoseconds.  The result
  * of clock_gettime() itself is not checked.
  */
 static long long
 getuptime(void)
 {
 	struct timespec now;
 	long long nsec;
 
 	(void)clock_gettime(CLOCK_UPTIME, &now);
 
 	nsec = (long long)now.tv_sec * 1000000000LL;
 	nsec += now.tv_nsec;
 	return (nsec);
 }
 
 /*
  * Populate *vmmp with the current VM statistics.
  *
  * With a kvm handle open (kd != NULL) the vmmeter structure is read
  * from the kernel image and the counters listed below are fetched with
  * kvm_counter_u64_fetch(); members not listed in that branch are left
  * untouched.  Otherwise each member is read from the matching
  * vm.stats.sys.* or vm.stats.vm.* sysctl.
  */
 static void
 fill_vmmeter(struct __vmmeter *vmmp)
 {
 
 	if (kd != NULL) {
 		struct vmmeter vm_cnt;
 
 		kread(X_SUM, &vm_cnt, sizeof(vm_cnt));
 #define	GET_COUNTER(name) \
 		vmmp->name = kvm_counter_u64_fetch(kd, (u_long)vm_cnt.name)
 		GET_COUNTER(v_swtch);
 		GET_COUNTER(v_trap);
 		GET_COUNTER(v_syscall);
 		GET_COUNTER(v_intr);
 		GET_COUNTER(v_soft);
 		GET_COUNTER(v_vm_faults);
 		GET_COUNTER(v_io_faults);
 		GET_COUNTER(v_cow_faults);
 		GET_COUNTER(v_cow_optim);
 		GET_COUNTER(v_zfod);
 		GET_COUNTER(v_ozfod);
 		GET_COUNTER(v_swapin);
 		GET_COUNTER(v_swapout);
 		GET_COUNTER(v_swappgsin);
 		GET_COUNTER(v_swappgsout);
 		GET_COUNTER(v_vnodein);
 		GET_COUNTER(v_vnodeout);
 		GET_COUNTER(v_vnodepgsin);
 		GET_COUNTER(v_vnodepgsout);
 		GET_COUNTER(v_intrans);
 		GET_COUNTER(v_tfree);
 		GET_COUNTER(v_forks);
 		GET_COUNTER(v_vforks);
 		GET_COUNTER(v_rforks);
 		GET_COUNTER(v_kthreads);
 		GET_COUNTER(v_forkpages);
 		GET_COUNTER(v_vforkpages);
 		GET_COUNTER(v_rforkpages);
 		GET_COUNTER(v_kthreadpages);
 #undef GET_COUNTER
 	} else {
 		size_t size;
 
 		/* Read vm.stats.<cat>.<name> straight into vmmp-><name>. */
 #define GET_VM_STATS(cat, name)	do {					\
 	size = sizeof(vmmp->name);					\
 	mysysctl("vm.stats." #cat "." #name, &vmmp->name, &size);	\
 } while (0)
 		/* sys */
 		GET_VM_STATS(sys, v_swtch);
 		GET_VM_STATS(sys, v_trap);
 		GET_VM_STATS(sys, v_syscall);
 		GET_VM_STATS(sys, v_intr);
 		GET_VM_STATS(sys, v_soft);
 
 		/* vm */
 		GET_VM_STATS(vm, v_vm_faults);
 		GET_VM_STATS(vm, v_io_faults);
 		GET_VM_STATS(vm, v_cow_faults);
 		GET_VM_STATS(vm, v_cow_optim);
 		GET_VM_STATS(vm, v_zfod);
 		GET_VM_STATS(vm, v_ozfod);
 		GET_VM_STATS(vm, v_swapin);
 		GET_VM_STATS(vm, v_swapout);
 		GET_VM_STATS(vm, v_swappgsin);
 		GET_VM_STATS(vm, v_swappgsout);
 		GET_VM_STATS(vm, v_vnodein);
 		GET_VM_STATS(vm, v_vnodeout);
 		GET_VM_STATS(vm, v_vnodepgsin);
 		GET_VM_STATS(vm, v_vnodepgsout);
 		GET_VM_STATS(vm, v_intrans);
 		GET_VM_STATS(vm, v_reactivated);
 		GET_VM_STATS(vm, v_pdwakeups);
 		GET_VM_STATS(vm, v_pdpages);
 		GET_VM_STATS(vm, v_pdshortfalls);
 		GET_VM_STATS(vm, v_dfree);
 		GET_VM_STATS(vm, v_pfree);
 		GET_VM_STATS(vm, v_tfree);
 		GET_VM_STATS(vm, v_page_size);
 		GET_VM_STATS(vm, v_page_count);
 		GET_VM_STATS(vm, v_free_reserved);
 		GET_VM_STATS(vm, v_free_target);
 		GET_VM_STATS(vm, v_free_min);
 		GET_VM_STATS(vm, v_free_count);
 		GET_VM_STATS(vm, v_wire_count);
 		GET_VM_STATS(vm, v_active_count);
 		GET_VM_STATS(vm, v_inactive_target);
 		GET_VM_STATS(vm, v_inactive_count);
 		GET_VM_STATS(vm, v_laundry_count);
 		GET_VM_STATS(vm, v_pageout_free_min);
 		GET_VM_STATS(vm, v_interrupt_free_min);
 		/*GET_VM_STATS(vm, v_free_severe);*/
 		GET_VM_STATS(vm, v_forks);
 		GET_VM_STATS(vm, v_vforks);
 		GET_VM_STATS(vm, v_rforks);
 		GET_VM_STATS(vm, v_kthreads);
 		GET_VM_STATS(vm, v_forkpages);
 		GET_VM_STATS(vm, v_vforkpages);
 		GET_VM_STATS(vm, v_rforkpages);
 		GET_VM_STATS(vm, v_kthreadpages);
 #undef GET_VM_STATS
 	}
 }
 
 /*
  * Fill *vmtp from the "vm.vmtotal" sysctl.
  *
  * Reading from a kvm descriptor (kd != NULL) is not implemented and
  * terminates the program, as does a size mismatch in the sysctl reply.
  */
 static void
 fill_vmtotal(struct vmtotal *vmtp)
 {
 	if (kd != NULL) {
 		/* XXX fill vmtp */
 		xo_errx(1, "not implemented");
 	} else {
 		size_t size = sizeof(*vmtp);
 		mysysctl("vm.vmtotal", vmtp, &size);
 		if (size != sizeof(*vmtp))
 			xo_errx(1, "vm.vmtotal size mismatch");
 	}
 }
 
 /*
  * Determine how many cpu columns, and what index they are in kern.cp_times.
  *
  * A CPU slot is counted when at least one of its CPUSTATES tick counters
  * in kern.cp_times is non-zero.  On return *maskp (if non-NULL) has bit
  * i set for every counted slot i, and *maxidp (if non-NULL) holds the
  * highest slot index present in the sysctl reply.  Returns the number of
  * counted slots.  Not implemented for kvm (kd != NULL).
  */
 static int
 getcpuinfo(u_long *maskp, int *maxidp)
 {
 	int maxcpu;
 	int maxid;
 	int ncpus;
 	int i, j;
 	int empty;
 	size_t size;
 	long *times;
 	u_long mask;
 
 	if (kd != NULL)
 		xo_errx(1, "not implemented");
 	mask = 0;
 	ncpus = 0;
 	size = sizeof(maxcpu);
 	mysysctl("kern.smp.maxcpus", &maxcpu, &size);
 	if (size != sizeof(maxcpu))
 		xo_errx(1, "sysctl kern.smp.maxcpus");
 	size = sizeof(long) * maxcpu * CPUSTATES;
 	times = malloc(size);
 	if (times == NULL)
 		xo_err(1, "malloc %zu bytes", size);
 	/* mysysctl() updates size to the number of bytes actually returned. */
 	mysysctl("kern.cp_times", times, &size);
 	maxid = (size / CPUSTATES / sizeof(long)) - 1;
 	for (i = 0; i <= maxid; i++) {
 		empty = 1;
 		for (j = 0; empty && j < CPUSTATES; j++) {
 			if (times[i * CPUSTATES + j] != 0)
 				empty = 0;
 		}
 		if (!empty) {
 			/*
 			 * NOTE(review): the mask is a single u_long, so slot
 			 * indices at or beyond its bit width cannot be
 			 * represented here.
 			 */
 			mask |= (1ul << i);
 			ncpus++;
 		}
 	}
 	/* The sample was only needed to find the populated slots. */
 	free(times);
 	if (maskp)
 		*maskp = mask;
 	if (maxidp)
 		*maxidp = maxid;
 	return (ncpus);
 }
 
 
 /*
  * Emit `val' as the libxo field `name', rendered in human-readable form.
  *
  * The value is scaled by humanize_number() into a buffer of `size' bytes
  * and printed right-aligned in a column `size' wide; the exact value is
  * also attached through xo_attr("value", ...).  `size' must be between
  * 5 and 9 inclusive (the local buffer is 10 bytes); anything else
  * terminates the program.
  */
 static void
-prthuman(const char *name, u_int64_t val, int size)
+prthuman(const char *name, uint64_t val, int size)
 {
 	char buf[10];
 	int flags;
 	char fmt[128];
 
 	/* Build "{:<name>/%*s}"; the width is supplied at emit time. */
 	snprintf(fmt, sizeof(fmt), "{:%s/%%*s}", name);
 
 	if (size < 5 || size > 9)
 		xo_errx(1, "doofus");
 	flags = HN_B | HN_NOSPACE | HN_DECIMAL;
 	humanize_number(buf, size, val, "", HN_AUTOSCALE, flags);
 	xo_attr("value", "%ju", (uintmax_t) val);
 	xo_emit(fmt, size, buf);
 }
 
 /*
  * hz:     clock rate, set by dovmstat() from the kernel or from the
  *         kern.clockrate sysctl.
  * hdrcnt: countdown of output lines until the header is printed again;
  *         reset by printhdr() and forced to 1 by the signal handlers.
  */
 static int hz, hdrcnt;
 
 /*
  * Per-CPU tick samples for the -P display: the current and the previous
  * reading of kern.cp_times, plus the size in bytes of each buffer.
  * Allocated by dovmstat() when Pflag is set.
  */
 static long *cur_cp_times;
 static long *last_cp_times;
 static size_t size_cp_times;
 
 /*
  * Main periodic display: print one line of process, memory, paging,
  * disk, fault and CPU statistics per sample.
  *
  * `interval' is the delay between samples in milliseconds (it is passed
  * to usleep() multiplied by 1000).  The loop ends once `reps' samples
  * have been printed; a negative `reps' loops forever.
  *
  * The first line shows averages since boot (rates are divided by the
  * uptime in seconds); later lines show rates over the last interval.
  */
 static void
 dovmstat(unsigned int interval, int reps)
 {
 	struct vmtotal total;
 	time_t uptime, halfuptime;
 	struct devinfo *tmp_dinfo;
 	size_t size;
 	int ncpus, maxid;
 	/*
 	 * NOTE(review): cpumask is only assigned when Pflag is set, yet it
 	 * is passed to printhdr() unconditionally below.
 	 */
 	u_long cpumask;
 	int rate_adj;
 
 	uptime = getuptime() / 1000000000LL;
 	halfuptime = uptime / 2;
 	rate_adj = 1;
 	ncpus = 1;
 	maxid = 0;
 
 	/*
 	 * If the user stops the program (control-Z) and then resumes it,
 	 * print out the header again.
 	 */
 	(void)signal(SIGCONT, needhdr);
 
 	/*
 	 * If our standard output is a tty, then install a SIGWINCH handler
 	 * and set wresized so that our first iteration through the main
 	 * vmstat loop will peek at the terminal's current rows to find out
 	 * how many lines can fit in a screenful of output.
 	 */
 	if (isatty(fileno(stdout)) != 0) {
 		wresized = 1;
 		(void)signal(SIGWINCH, needresize);
 	} else {
 		wresized = 0;
 		winlines = VMSTAT_DEFAULT_LINES;
 	}
 
 	/* Obtain the clock rate: prefer stathz, fall back to hz. */
 	if (kd != NULL) {
 		if (namelist[X_STATHZ].n_type != 0 &&
 		    namelist[X_STATHZ].n_value != 0)
 			kread(X_STATHZ, &hz, sizeof(hz));
 		if (!hz)
 			kread(X_HZ, &hz, sizeof(hz));
 	} else {
 		struct clockinfo clockrate;
 
 		size = sizeof(clockrate);
 		mysysctl("kern.clockrate", &clockrate, &size);
 		if (size != sizeof(clockrate))
 			xo_errx(1, "clockrate size mismatch");
 		hz = clockrate.hz;
 	}
 
 	/* Per-CPU mode: size the two kern.cp_times sample buffers. */
 	if (Pflag) {
 		ncpus = getcpuinfo(&cpumask, &maxid);
 		size_cp_times = sizeof(long) * (maxid + 1) * CPUSTATES;
 		/* NOTE(review): the calloc() results are not checked. */
 		cur_cp_times = calloc(1, size_cp_times);
 		last_cp_times = calloc(1, size_cp_times);
 	}
 	for (hdrcnt = 1;;) {
 		/* Reprint the header whenever the countdown reaches zero. */
 		if (!--hdrcnt)
 			printhdr(maxid, cpumask);
 		if (kd != NULL) {
 			if (kvm_getcptime(kd, cur.cp_time) < 0)
 				xo_errx(1, "kvm_getcptime: %s", kvm_geterr(kd));
 		} else {
 			size = sizeof(cur.cp_time);
 			mysysctl("kern.cp_time", &cur.cp_time, &size);
 			if (size != sizeof(cur.cp_time))
 				xo_errx(1, "cp_time size mismatch");
 		}
 		if (Pflag) {
 			size = size_cp_times;
 			mysysctl("kern.cp_times", cur_cp_times, &size);
 			if (size != size_cp_times)
 				xo_errx(1, "cp_times mismatch");
 		}
 
 		/* Swap the device snapshots so `last' holds the old sample. */
 		tmp_dinfo = last.dinfo;
 		last.dinfo = cur.dinfo;
 		cur.dinfo = tmp_dinfo;
 		last.snap_time = cur.snap_time;
 
 		/*
 		 * Here what we want to do is refresh our device stats.
 		 * getdevs() returns 1 when the device list has changed.
 		 * If the device list has changed, we want to go through
 		 * the selection process again, in case a device that we
 		 * were previously displaying has gone away.
 		 */
 		switch (devstat_getdevs(NULL, &cur)) {
 		case -1:
 			xo_errx(1, "%s", devstat_errbuf);
 			break;
 		case 1: {
 			int retval;
 
 			num_devices = cur.dinfo->numdevs;
 			generation = cur.dinfo->generation;
 
 			retval = devstat_selectdevs(&dev_select, &num_selected,
 					    &num_selections, &select_generation,
 					    generation, cur.dinfo->devices,
 					    num_devices, matches, num_matches,
 					    specified_devices,
 					    num_devices_specified, select_mode,
 					    maxshowdevs, 0);
 			switch (retval) {
 			case -1:
 				xo_errx(1, "%s", devstat_errbuf);
 				break;
 			case 1:
 				printhdr(maxid, cpumask);
 				break;
 			default:
 				break;
 			}
 		}
 		/* case 1 falls through to default, which only breaks. */
 		default:
 			break;
 		}
 
 		fill_vmmeter(&sum);
 		fill_vmtotal(&total);
 		xo_open_container("processes");
 		xo_emit("{:runnable/%1d} {:waiting/%ld} "
 		        "{:swapped-out/%ld}",
 		    total.t_rq - 1, total.t_dw + total.t_pw, total.t_sw);
 		xo_close_container("processes");
 		xo_open_container("memory");
 /* vmstat_pgtok: convert a page count to kilobytes. */
-#define vmstat_pgtok(a) ((a) * (sum.v_page_size >> 10))
+#define vmstat_pgtok(a) ((uintmax_t)(a) * (sum.v_page_size >> 10))
 #define	rate(x)	(((x) * rate_adj + halfuptime) / uptime)	/* round */
 		if (hflag) {
 			xo_emit("");
 			prthuman("available-memory",
-			         total.t_avm * (u_int64_t)sum.v_page_size, 5);
+			         total.t_avm * (uint64_t)sum.v_page_size, 5);
 			xo_emit(" ");
 			prthuman("free-memory",
-			         total.t_free * (u_int64_t)sum.v_page_size, 5);
+			         total.t_free * (uint64_t)sum.v_page_size, 5);
 			xo_emit(" ");
 		} else {
 			xo_emit(" ");
-			xo_emit("{:available-memory/%7d}",
+			xo_emit("{:available-memory/%7ju}",
 			        vmstat_pgtok(total.t_avm));
 			xo_emit(" ");
-			xo_emit("{:free-memory/%7d}",
+			xo_emit("{:free-memory/%7ju}",
 			        vmstat_pgtok(total.t_free));
 			xo_emit(" ");
 		}
 		xo_emit("{:total-page-faults/%5lu} ",
 		        (unsigned long)rate(sum.v_vm_faults -
 		        osum.v_vm_faults));
 		xo_close_container("memory");
 
 		xo_open_container("paging-rates");
 		xo_emit("{:page-reactivated/%3lu} ",
 		    (unsigned long)rate(sum.v_reactivated - osum.v_reactivated));
 		xo_emit("{:paged-in/%3lu} ",
 		    (unsigned long)rate(sum.v_swapin + sum.v_vnodein -
 		    (osum.v_swapin + osum.v_vnodein)));
 		xo_emit("{:paged-out/%3lu} ",
 		    (unsigned long)rate(sum.v_swapout + sum.v_vnodeout -
 		    (osum.v_swapout + osum.v_vnodeout)));
 		xo_emit("{:freed/%5lu} ",
 		    (unsigned long)rate(sum.v_tfree - osum.v_tfree));
 		xo_emit("{:scanned/%4lu} ",
 		    (unsigned long)rate(sum.v_pdpages - osum.v_pdpages));
 		xo_close_container("paging-rates");
 
 		devstats();
 		xo_open_container("fault-rates");
 		/*
 		 * NOTE(review): context-switches is formatted with %5u but
 		 * the argument is cast to unsigned long like its neighbours.
 		 */
 		xo_emit("{:interrupts/%4lu} {:system-calls/%5lu} "
 		        "{:context-switches/%5u}",
 		    (unsigned long)rate(sum.v_intr - osum.v_intr),
 		    (unsigned long)rate(sum.v_syscall - osum.v_syscall),
 		    (unsigned long)rate(sum.v_swtch - osum.v_swtch));
 		xo_close_container("fault-rates");
 		if (Pflag)
 			pcpustats(ncpus, cpumask, maxid);
 		else
 			cpustats();
 		xo_emit("\n");
 		xo_flush();
 		if (reps >= 0 && --reps <= 0)
 			break;
 		/*
 		 * From now on rates are computed over the interval: the
 		 * divisor becomes the interval in milliseconds and counts
 		 * are scaled by 1000 to yield per-second figures.
 		 */
 		osum = sum;
 		uptime = interval;
 		rate_adj = 1000;
 		/*
 		 * We round upward to avoid losing low-frequency events
 		 * (i.e., >= 1 per interval but < 1 per millisecond).
 		 */
 		if (interval != 1)
 			halfuptime = (uptime + 1) / 2;
 		else
 			halfuptime = 0;
 		(void)usleep(interval * 1000);
 	}
 }
 
 /*
  * Print the two-line column header.  `maxid' and `cpumask' describe the
  * per-CPU columns and are only consulted when Pflag is set.  Afterwards
  * the header countdown is reset to the number of lines per screen,
  * refreshing that number first if a resize is pending.
  */
 static void
 printhdr(int maxid, u_long cpumask)
 {
 	int idx, shown;
 
 	shown = MIN(num_selected, maxshowdevs);
 
 	/* First line: group titles. */
 	if (hflag)
 		xo_emit("{T:procs}  {T:memory}       {T:/page%*s}", 19, "");
 	else
 		xo_emit("{T:procs}     {T:memory}        {T:/page%*s}", 19, "");
 	if (shown == 1)
 		xo_emit("   {T:disks}");
 	else if (shown > 1)
 		xo_emit(" {T:/disks %*s}", shown * 4 - 7, "");
 	xo_emit("   {T:faults}      ");
 	if (!Pflag) {
 		xo_emit("   {T:cpu}\n");
 	} else {
 		for (idx = 0; idx <= maxid; idx++) {
 			if ((cpumask & (1ul << idx)) != 0)
 				xo_emit("  {T:/cpu%d}   ", idx);
 		}
 		xo_emit("\n");
 	}
 
 	/* Second line: individual column titles. */
 	if (hflag)
 		xo_emit("{T:r} {T:b} {T:w}  {T:avm}   {T:fre}   {T:flt}  {T:re}  {T:pi}  {T:po}    {T:fr}   {T:sr} ");
 	else
 		xo_emit("{T:r} {T:b} {T:w}     {T:avm}     {T:fre}  {T:flt}  {T:re}  {T:pi}  {T:po}    {T:fr}   {T:sr} ");
 	for (idx = 0; idx < num_devices; idx++) {
 		if (dev_select[idx].selected == 0 ||
 		    dev_select[idx].selected > maxshowdevs)
 			continue;
 		xo_emit("{T:/%c%c%d} ", dev_select[idx].device_name[0],
 		    dev_select[idx].device_name[1],
 		    dev_select[idx].unit_number);
 	}
 	xo_emit("  {T:in}    {T:sy}    {T:cs}");
 	if (!Pflag) {
 		xo_emit(" {T:us} {T:sy} {T:id}\n");
 	} else {
 		for (idx = 0; idx <= maxid; idx++) {
 			if ((cpumask & (1ul << idx)) != 0)
 				xo_emit(" {T:us} {T:sy} {T:id}");
 		}
 		xo_emit("\n");
 	}
 
 	if (wresized != 0)
 		doresize();
 	hdrcnt = winlines;
 }
 
 /*
  * Force a header to be prepended to the next output.
  *
  * Signal handler (dovmstat() installs it for SIGCONT); the signal
  * number is ignored.  Setting hdrcnt to 1 makes the next pre-decrement
  * in the display loop reach zero.
  */
 static void
 needhdr(int dummy __unused)
 {
 
 	hdrcnt = 1;
 }
 
 /*
  * When the terminal is resized, force an update of the maximum number of rows
  * printed between each header repetition.  Then force a new header to be
  * prepended to the next output.
  *
  * Signal handler (dovmstat() installs it for SIGWINCH); `signo' is not
  * used.  The actual window-size query is deferred to doresize(), which
  * printhdr() calls while wresized is set.
  */
 void
 needresize(int signo)
 {
 
 	wresized = 1;
 	hdrcnt = 1;
 }
 
 /*
  * Refresh the global `winlines' from the terminal attached to stdout.
  *
  * The TIOCGWINSZ query is retried while it is interrupted by a signal;
  * any other failure is fatal.  Three rows are reserved, and terminals
  * with three rows or fewer fall back to VMSTAT_DEFAULT_LINES.
  */
 void
 doresize(void)
 {
 	struct winsize ws;
 	int rc;
 
 	do {
 		rc = ioctl(fileno(stdout), TIOCGWINSZ, &ws);
 	} while (rc == -1 && errno == EINTR);
 	if (rc == -1)
 		xo_err(1, "ioctl");
 
 	if (ws.ws_row > 3)
 		winlines = ws.ws_row - 3;
 	else
 		winlines = VMSTAT_DEFAULT_LINES;
 
 	/*
 	 * No further doresize() calls are needed until SIGWINCH sets the
 	 * flag again.
 	 */
 	wresized = 0;
 }
 
 #ifdef notyet
 /*
  * Print page-reclaim and page-in counts with their total and average
  * times, read from the kernel through kread().  Compiled only when
  * `notyet' is defined.
  *
  * NOTE(review): both averages divide by a counter (sum.v_pgrec,
  * sum.v_pgin) without checking it for zero.
  */
 static void
 dotimes(void)
 {
 	unsigned int pgintime, rectime;
 
 	kread(X_REC, &rectime, sizeof(rectime));
 	kread(X_PGIN, &pgintime, sizeof(pgintime));
 	kread(X_SUM, &sum, sizeof(sum));
 	xo_emit("{:page-reclaims/%u} {N:reclaims}, "
 		"{:reclaim-time/%u} {N:total time (usec)}\n",
 	    sum.v_pgrec, rectime);
 	xo_emit("{L:average}: {:reclaim-average/%u} {N:usec \\/ reclaim}\n",
 		rectime / sum.v_pgrec);
 	xo_emit("\n");
 	xo_emit("{:page-ins/%u} {N:page ins}, "
 		"{:page-in-time/%u} {N:total time (msec)}\n",
 	    sum.v_pgin, pgintime / 10);
 	xo_emit("{L:average}: {:average/%8.1f} {N:msec \\/ page in}\n",
 	    pgintime / (sum.v_pgin * 10.0));
 }
 #endif
 
 /*
  * Return `top' as an integer percentage of `bot', truncated toward
  * zero.  A zero `bot' yields 0 instead of dividing by zero.
  */
 static long
 pct(long top, long bot)
 {
 	long ans;
 
 	if (bot == 0)
 		return (0);
 	/* Widen to 64 bits so top * 100 cannot overflow a 32-bit long. */
 	ans = (int64_t)top * 100 / bot;
 	return (ans);
 }
 
 /* Convenience wrapper: coerce both operands to long before calling pct(). */
 #define	PCT(top, bot) pct((long)(top), (long)(bot))
 
 /*
  * Print the one-shot summary: every VM meter value collected by
  * fill_vmmeter(), one per line, followed by name-cache lookup totals
  * and hit percentages.
  *
  * NOTE(review): all meter values are printed with %9u; fill_vmmeter()
  * stores kvm_counter_u64_fetch() results into the same fields, so
  * confirm the conversion matches the field types in struct __vmmeter.
  */
 static void
 dosum(void)
 {
 	struct nchstats lnchstats;
 	long nchtotal;
 
 	fill_vmmeter(&sum);
 	xo_open_container("summary-statistics");
 	xo_emit("{:context-switches/%9u} {N:cpu context switches}\n",
 		sum.v_swtch);
 	xo_emit("{:interrupts/%9u} {N:device interrupts}\n",
 		sum.v_intr);
 	xo_emit("{:software-interrupts/%9u} {N:software interrupts}\n",
 		sum.v_soft);
 	xo_emit("{:traps/%9u} {N:traps}\n", sum.v_trap);
 	xo_emit("{:system-calls/%9u} {N:system calls}\n",
 		sum.v_syscall);
 	xo_emit("{:kernel-threads/%9u} {N:kernel threads created}\n",
 		sum.v_kthreads);
 	xo_emit("{:forks/%9u} {N: fork() calls}\n", sum.v_forks);
 	xo_emit("{:vforks/%9u} {N:vfork() calls}\n",
 		sum.v_vforks);
 	xo_emit("{:rforks/%9u} {N:rfork() calls}\n",
 		sum.v_rforks);
 	xo_emit("{:swap-ins/%9u} {N:swap pager pageins}\n",
 		sum.v_swapin);
 	xo_emit("{:swap-in-pages/%9u} {N:swap pager pages paged in}\n",
 		sum.v_swappgsin);
 	xo_emit("{:swap-outs/%9u} {N:swap pager pageouts}\n",
 		sum.v_swapout);
 	xo_emit("{:swap-out-pages/%9u} {N:swap pager pages paged out}\n",
 		sum.v_swappgsout);
 	xo_emit("{:vnode-page-ins/%9u} {N:vnode pager pageins}\n",
 		sum.v_vnodein);
 	xo_emit("{:vnode-page-in-pages/%9u} {N:vnode pager pages paged in}\n",
 		sum.v_vnodepgsin);
 	xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pageouts}\n",
 		sum.v_vnodeout);
 	xo_emit("{:vnode-page-out-pages/%9u} {N:vnode pager pages paged out}\n",
 		sum.v_vnodepgsout);
 	xo_emit("{:page-daemon-wakeups/%9u} {N:page daemon wakeups}\n",
 		sum.v_pdwakeups);
 	xo_emit("{:page-daemon-pages/%9u} {N:pages examined by the page daemon}\n",
 		sum.v_pdpages);
 	xo_emit("{:page-reclamation-shortfalls/%9u} {N:clean page reclamation shortfalls}\n",
 		sum.v_pdshortfalls);
 	xo_emit("{:reactivated/%9u} {N:pages reactivated by the page daemon}\n",
 		sum.v_reactivated);
 	xo_emit("{:copy-on-write-faults/%9u} {N:copy-on-write faults}\n",
 		sum.v_cow_faults);
 	xo_emit("{:copy-on-write-optimized-faults/%9u} {N:copy-on-write optimized faults}\n",
 		sum.v_cow_optim);
 	xo_emit("{:zero-fill-pages/%9u} {N:zero fill pages zeroed}\n",
 		sum.v_zfod);
 	xo_emit("{:zero-fill-prezeroed/%9u} {N:zero fill pages prezeroed}\n",
 		sum.v_ozfod);
 	xo_emit("{:intransit-blocking/%9u} {N:intransit blocking page faults}\n",
 		sum.v_intrans);
 	xo_emit("{:total-faults/%9u} {N:total VM faults taken}\n",
 		sum.v_vm_faults);
 	xo_emit("{:faults-requiring-io/%9u} {N:page faults requiring I\\/O}\n",
 		sum.v_io_faults);
 	xo_emit("{:faults-from-thread-creation/%9u} {N:pages affected by kernel thread creation}\n",
 		sum.v_kthreadpages);
 	xo_emit("{:faults-from-fork/%9u} {N:pages affected by  fork}()\n",
 		sum.v_forkpages);
 	xo_emit("{:faults-from-vfork/%9u} {N:pages affected by vfork}()\n",
 		sum.v_vforkpages);
 	xo_emit("{:pages-rfork/%9u} {N:pages affected by rfork}()\n",
 		sum.v_rforkpages);
 	xo_emit("{:pages-freed/%9u} {N:pages freed}\n",
 		sum.v_tfree);
 	xo_emit("{:pages-freed-by-daemon/%9u} {N:pages freed by daemon}\n",
 		sum.v_dfree);
 	xo_emit("{:pages-freed-on-exit/%9u} {N:pages freed by exiting processes}\n",
 		sum.v_pfree);
 	xo_emit("{:active-pages/%9u} {N:pages active}\n",
 		sum.v_active_count);
 	xo_emit("{:inactive-pages/%9u} {N:pages inactive}\n",
 		sum.v_inactive_count);
 	xo_emit("{:laundry-pages/%9u} {N:pages in the laundry queue}\n",
 		sum.v_laundry_count);
 	xo_emit("{:wired-pages/%9u} {N:pages wired down}\n",
 		sum.v_wire_count);
 	xo_emit("{:free-pages/%9u} {N:pages free}\n",
 		sum.v_free_count);
 	xo_emit("{:bytes-per-page/%9u} {N:bytes per page}\n", sum.v_page_size);
 	/* Name-cache statistics: from the kernel image or from sysctl. */
 	if (kd != NULL) {
 		kread(X_NCHSTATS, &lnchstats, sizeof(lnchstats));
 	} else {
 		size_t size = sizeof(lnchstats);
 		mysysctl("vfs.cache.nchstats", &lnchstats, &size);
 		if (size != sizeof(lnchstats))
 			xo_errx(1, "vfs.cache.nchstats size mismatch");
 	}
 	/* Total lookups; PCT() below reports each class as a share of it. */
 	nchtotal = lnchstats.ncs_goodhits + lnchstats.ncs_neghits +
 	    lnchstats.ncs_badhits + lnchstats.ncs_falsehits +
 	    lnchstats.ncs_miss + lnchstats.ncs_long;
 	xo_emit("{:total-name-lookups/%9ld} {N:total name lookups}\n",
 	        nchtotal);
 	xo_emit("{P:/%9s} {N:cache hits} "
 	        "({:positive-cache-hits/%ld}% pos + "
 	        "{:negative-cache-hits/%ld}% {N:neg}) "
 	        "system {:cache-hit-percent/%ld}% per-directory\n",
 	    "", PCT(lnchstats.ncs_goodhits, nchtotal),
 	    PCT(lnchstats.ncs_neghits, nchtotal),
 	    PCT(lnchstats.ncs_pass2, nchtotal));
 	xo_emit("{P:/%9s} {L:deletions} {:deletions/%ld}%, "
 	        "{L:falsehits} {:false-hits/%ld}%, "
 	        "{L:toolong} {:too-long/%ld}%\n", "",
 	    PCT(lnchstats.ncs_badhits, nchtotal),
 	    PCT(lnchstats.ncs_falsehits, nchtotal),
 	    PCT(lnchstats.ncs_long, nchtotal));
 	xo_close_container("summary-statistics");
 }
 
 /*
  * Print fork, vfork and rfork statistics: the number of calls, the
  * number of pages involved and the average pages per call (0.0 when
  * there were no calls of that kind).
  */
 static void
 doforkst(void)
 {
 	double fork_avg, vfork_avg, rfork_avg;
 
 	fill_vmmeter(&sum);
 
 	fork_avg = 0.0;
 	if (sum.v_forks != 0)
 		fork_avg = (double)sum.v_forkpages / sum.v_forks;
 	vfork_avg = 0.0;
 	if (sum.v_vforks != 0)
 		vfork_avg = (double)sum.v_vforkpages / sum.v_vforks;
 	rfork_avg = 0.0;
 	if (sum.v_rforks != 0)
 		rfork_avg = (double)sum.v_rforkpages / sum.v_rforks;
 
 	xo_open_container("fork-statistics");
 	xo_emit("{:fork/%u} {N:forks}, {:fork-pages/%u} {N:pages}, "
 		"{L:average} {:fork-average/%.2f}\n",
 	    sum.v_forks, sum.v_forkpages, fork_avg);
 	xo_emit("{:vfork/%u} {N:vforks}, {:vfork-pages/%u} {N:pages}, "
 		"{L:average} {:vfork-average/%.2f}\n",
 	    sum.v_vforks, sum.v_vforkpages, vfork_avg);
 	xo_emit("{:rfork/%u} {N:rforks}, {:rfork-pages/%u} {N:pages}, "
 		"{L:average} {:rfork-average/%.2f}\n",
 	    sum.v_rforks, sum.v_rforkpages, rfork_avg);
 	xo_close_container("fork-statistics");
 }
 
 /*
  * Emit the transfers-per-second figure of every displayed device.
  *
  * As a side effect the global CPU tick sample is turned into a delta:
  * cur.cp_time[] becomes the difference from the previous sample and
  * last.cp_time[] takes the raw value just read.
  */
 static void
 devstats(void)
 {
 	long double elapsed, tps;
 	long snapshot;
 	int dn, state;
 
 	for (state = 0; state < CPUSTATES; state++) {
 		snapshot = cur.cp_time[state];
 		cur.cp_time[state] = snapshot - last.cp_time[state];
 		last.cp_time[state] = snapshot;
 	}
 
 	elapsed = cur.snap_time - last.snap_time;
 
 	xo_open_list("device");
 	for (dn = 0; dn < num_devices; dn++) {
 		int pos;
 
 		/* Skip devices that are unselected or beyond the limit. */
 		if (dev_select[dn].selected == 0 ||
 		    dev_select[dn].selected > maxshowdevs)
 			continue;
 
 		pos = dev_select[dn].position;
 
 		if (devstat_compute_statistics(&cur.dinfo->devices[pos],
 		    &last.dinfo->devices[pos], elapsed,
 		    DSM_TRANSFERS_PER_SECOND, &tps,
 		    DSM_NONE) != 0)
 			xo_errx(1, "%s", devstat_errbuf);
 
 		xo_open_instance("device");
 		xo_emit("{ekq:name/%c%c%d}{:transfers/%3.0Lf} ",
 		    dev_select[dn].device_name[0],
 		    dev_select[dn].device_name[1],
 		    dev_select[dn].unit_number,
 		    tps);
 		xo_close_instance("device");
 	}
 	xo_close_list("device");
 }
 
 /*
  * Emit a percentage as the libxo field `name', preceded by a space and
  * normally two columns wide.
  *
  * `*over' counts the columns by which earlier values overflowed their
  * slot.  A value wider than two digits adds one to it; a one-digit
  * value printed while it is non-zero is squeezed into a single column
  * and takes one back, pulling the row toward alignment.
  */
 static void
 percent(const char *name, double pct, int *over)
 {
 	char digits[10];
 	char fmt[128];
 	int len, narrow;
 
 	snprintf(fmt, sizeof(fmt), " {:%s/%%*s}", name);
 	len = snprintf(digits, sizeof(digits), "%.0f", pct);
 
 	narrow = (len == 1 && *over);
 	xo_emit(fmt, narrow ? 1 : 2, digits);
 	if (narrow)
 		(*over)--;
 	if (len > 2)
 		(*over)++;
 }
 
 /*
  * Emit the system-wide user, system and idle percentages computed from
  * cur.cp_time[].  User time includes nice time and system time includes
  * interrupt time.  All three are zero when no ticks were recorded.
  */
 static void
 cpustats(void)
 {
 	double scale, ticks;
 	int over, state;
 
 	ticks = 0;
 	for (state = 0; state < CPUSTATES; state++)
 		ticks += cur.cp_time[state];
 	scale = ticks ? 100.0 / ticks : 0.0;
 
 	over = 0;
 	xo_open_container("cpu-statistics");
 	percent("user",
 	    (cur.cp_time[CP_USER] + cur.cp_time[CP_NICE]) * scale, &over);
 	percent("system",
 	    (cur.cp_time[CP_SYS] + cur.cp_time[CP_INTR]) * scale, &over);
 	percent("idle", cur.cp_time[CP_IDLE] * scale, &over);
 	xo_close_container("cpu-statistics");
 }
 
 /*
  * Emit user, system and idle percentages for every CPU whose bit is set
  * in `cpumask', for slot indices 0..maxid.  `ncpus' is accepted but not
  * used here.
  *
  * The per-CPU samples are first turned into deltas, as devstats() does
  * for the global sample: cur_cp_times[] receives the difference from
  * the previous reading and last_cp_times[] the raw value.
  */
 static void
 pcpustats(int ncpus, u_long cpumask, int maxid)
 {
 	double scale, ticks;
 	long *now, *prev;
 	long raw;
 	int cpu, over, state;
 
 	for (cpu = 0; cpu <= maxid; cpu++) {
 		if ((cpumask & (1ul << cpu)) == 0)
 			continue;
 		now = &cur_cp_times[cpu * CPUSTATES];
 		prev = &last_cp_times[cpu * CPUSTATES];
 		for (state = 0; state < CPUSTATES; state++) {
 			raw = now[state];
 			now[state] = raw - prev[state];
 			prev[state] = raw;
 		}
 	}
 
 	over = 0;
 	xo_open_list("cpu");
 	for (cpu = 0; cpu <= maxid; cpu++) {
 		if ((cpumask & (1ul << cpu)) == 0)
 			continue;
 		now = &cur_cp_times[cpu * CPUSTATES];
 
 		xo_open_instance("cpu");
 		xo_emit("{ke:name/%d}", cpu);
 		ticks = 0;
 		for (state = 0; state < CPUSTATES; state++)
 			ticks += now[state];
 		scale = ticks ? 100.0 / ticks : 0.0;
 		percent("user", (now[CP_USER] + now[CP_NICE]) * scale, &over);
 		percent("system", (now[CP_SYS] + now[CP_INTR]) * scale, &over);
 		percent("idle", now[CP_IDLE] * scale, &over);
 		xo_close_instance("cpu");
 	}
 	xo_close_list("cpu");
 }
 
 /*
  * Read the kernel's interrupt counters into a freshly allocated array
  * stored in *intrcnts and return the number of counters.  The caller
  * owns the array.
  *
  * Without a kvm descriptor the "hw.intrcnt" sysctl is retried with a
  * doubling buffer, starting at 1024 bytes, until it succeeds.
  *
  * NOTE(review): allocation failures are reported with err() here while
  * the rest of the file uses xo_err().
  */
 static unsigned int
 read_intrcnts(unsigned long **intrcnts)
 {
 	size_t len;
 
 	if (kd == NULL) {
 		*intrcnts = NULL;
 		len = 1024;
 		for (;;) {
 			*intrcnts = reallocf(*intrcnts, len);
 			if (*intrcnts == NULL)
 				err(1, "reallocf()");
 			if (mysysctl("hw.intrcnt", *intrcnts, &len) == 0)
 				break;
 			len *= 2;
 		}
 	} else {
 		kread(X_SINTRCNT, &len, sizeof(len));
 		*intrcnts = malloc(len);
 		if (*intrcnts == NULL)
 			err(1, "malloc()");
 		kread(X_INTRCNT, *intrcnts, len);
 	}
 
 	return (len / sizeof(unsigned long));
 }
 
 /*
  * Print one row per interrupt source showing how much its counter grew
  * between `old_intrcnts' and `intrcnts' and the resulting rate per
  * second over `period_ms' milliseconds, followed by a "Total" row.
  *
  * `intrnames' is a packed sequence of NUL-terminated names, one per
  * counter.  Sources with an empty name are never shown; sources whose
  * current count is zero are shown only when aflag is set.  All `nintr'
  * counters contribute to the total regardless.  `istrnamlen' is the
  * width of the name column.
  */
 static void
 print_intrcnts(unsigned long *intrcnts, unsigned long *old_intrcnts,
 		char *intrnames, unsigned int nintr,
 		size_t istrnamlen, long long period_ms)
 {
 	uint64_t sum_now, sum_old, total_count, total_rate;
 	char *label;
 	unsigned int n;
 
 	sum_now = 0;
 	sum_old = 0;
 	label = intrnames;
 	xo_open_list("interrupt");
 	for (n = 0; n < nintr; n++) {
 		if (label[0] != '\0' && (intrcnts[n] != 0 || aflag)) {
 			unsigned long count, rate;
 
 			count = intrcnts[n] - old_intrcnts[n];
 			/* Adding half the period rounds to nearest. */
 			rate = (count * 1000 + period_ms / 2) / period_ms;
 			xo_open_instance("interrupt");
 			xo_emit("{d:name/%-*s}{ket:name/%s} "
 			    "{:total/%20lu} {:rate/%10lu}\n",
 			    (int)istrnamlen, label,
 			    label, count, rate);
 			xo_close_instance("interrupt");
 		}
 		label += strlen(label) + 1;
 		sum_now += intrcnts[n];
 		sum_old += old_intrcnts[n];
 	}
 	total_count = sum_now - sum_old;
 	total_rate = (total_count * 1000 + period_ms / 2) / period_ms;
 	xo_close_list("interrupt");
 	xo_emit("{L:/%-*s} {:total-interrupts/%20" PRIu64 "} "
 	        "{:total-rate/%10" PRIu64 "}\n", (int)istrnamlen,
 	        "Total", total_count, total_rate);
 }
 
 /*
  * Interrupt display: print per-source interrupt counts and rates.
  *
  * The first report covers the time since boot; each later report covers
  * the time since the previous one.  `interval' is the delay between
  * reports in milliseconds (it is passed to usleep() multiplied by
  * 1000).  The loop ends once `reps' reports have been printed; a
  * negative `reps' loops forever.
  */
 static void
 dointr(unsigned int interval, int reps)
 {
 	unsigned long *intrcnts;
 	long long uptime, period_ms;
 	unsigned long *old_intrcnts = NULL;
 	size_t clen, inamlen, istrnamlen;
 	char *intrnames, *intrname;
 
 	uptime = getuptime();
 
 	/* Get the names of each interrupt source */
 	if (kd != NULL) {
 		kread(X_SINTRNAMES, &inamlen, sizeof(inamlen));
 		if ((intrnames = malloc(inamlen)) == NULL)
 			xo_err(1, "malloc()");
 		kread(X_INTRNAMES, intrnames, inamlen);
 	} else {
 		/* Retry with a doubling buffer until the sysctl succeeds. */
 		for (intrnames = NULL, inamlen = 1024; ; inamlen *= 2) {
 			if ((intrnames = reallocf(intrnames, inamlen)) == NULL)
 				xo_err(1, "reallocf()");
 			if (mysysctl("hw.intrnames", intrnames, &inamlen) == 0)
 				break;
 		}
 	}
 
 	/* Determine the length of the longest interrupt name */
 	intrname = intrnames;
 	istrnamlen = strlen("interrupt");
 	while(*intrname != '\0') {
 		clen = strlen(intrname);
 		if (clen > istrnamlen)
 			istrnamlen = clen;
 		intrname += strlen(intrname) + 1;
 	}
 	xo_emit("{T:/%-*s} {T:/%20s} {T:/%10s}\n",
 	        (int)istrnamlen, "interrupt", "total", "rate");
 
 	/* 
 	 * Loop reps times printing differential interrupt counts.  If reps is
 	 * zero, then run just once, printing total counts
 	 */
 	xo_open_container("interrupt-statistics");
 
 	/* The first period is the whole uptime, converted from ns to ms. */
 	period_ms = uptime / 1000000;
 	while(1) {
 		unsigned int nintr;
 		long long old_uptime;
 
 		nintr = read_intrcnts(&intrcnts);
 		/* 
 		 * Initialize old_intrcnts to 0 for the first pass, so
 		 * print_intrcnts will print total interrupts since boot
 		 */
 		if (old_intrcnts == NULL) {
 			old_intrcnts = calloc(nintr, sizeof(unsigned long));
 			if (old_intrcnts == NULL)
 				xo_err(1, "calloc()");
 		}
 
 		print_intrcnts(intrcnts, old_intrcnts, intrnames, nintr,
 		    istrnamlen, period_ms);
 		xo_flush();
 
 		/* The sample just printed becomes the next baseline. */
 		free(old_intrcnts);
 		old_intrcnts = intrcnts;
 		if (reps >= 0 && --reps <= 0)
 			break;
 		usleep(interval * 1000);
 		old_uptime = uptime;
 		uptime = getuptime();
 		period_ms = (uptime - old_uptime) / 1000000;
 	}
 
 	xo_close_container("interrupt-statistics");
 
 	/* Release the last counter sample and the name table. */
 	free(old_intrcnts);
 	free(intrnames);
 }
 
 /*
  * Print malloc(9) statistics: one row per memory type that has a
  * non-zero allocation count or request count, listing items in use,
  * memory in use (KiB, rounded up), total requests and the allocation
  * sizes flagged in the type's size mask.
  *
  * Statistics come from sysctl, or from the kvm descriptor when one is
  * open.  A sysctl failure prints a warning and returns; a kvm failure
  * prints a warning and continues with whatever the list contains.
  */
 static void
 domemstat_malloc(void)
 {
 	struct memory_type_list *mtlp;
 	struct memory_type *mtp;
 	int error, first, i;
 
 	mtlp = memstat_mtl_alloc();
 	if (mtlp == NULL) {
 		xo_warn("memstat_mtl_alloc");
 		return;
 	}
 	if (kd == NULL) {
 		if (memstat_sysctl_malloc(mtlp, 0) < 0) {
 			xo_warnx("memstat_sysctl_malloc: %s",
 			    memstat_strerror(memstat_mtl_geterror(mtlp)));
 			/* Do not leak the list on the error path. */
 			memstat_mtl_free(mtlp);
 			return;
 		}
 	} else {
 		if (memstat_kvm_malloc(mtlp, kd) < 0) {
 			error = memstat_mtl_geterror(mtlp);
 			if (error == MEMSTAT_ERROR_KVM)
 				xo_warnx("memstat_kvm_malloc: %s",
 				    kvm_geterr(kd));
 			else
 				xo_warnx("memstat_kvm_malloc: %s",
 				    memstat_strerror(error));
 		}
 	}
 	xo_open_container("malloc-statistics");
 	xo_emit("{T:/%13s} {T:/%5s} {T:/%6s} {T:/%7s} {T:/%8s}  {T:Size(s)}\n",
 		"Type", "InUse", "MemUse", "HighUse", "Requests");
 	xo_open_list("memory");
 	for (mtp = memstat_mtl_first(mtlp); mtp != NULL;
 	    mtp = memstat_mtl_next(mtp)) {
 		if (memstat_get_numallocs(mtp) == 0 &&
 		    memstat_get_count(mtp) == 0)
 			continue;
 		xo_open_instance("memory");
 		xo_emit("{k:type/%13s/%s} {:in-use/%5" PRIu64 "} "
 			"{:memory-use/%5" PRIu64 "}{U:K} {:high-use/%7s} "
 			"{:requests/%8" PRIu64 "}  ",
 		    memstat_get_name(mtp), memstat_get_count(mtp),
 		    (memstat_get_bytes(mtp) + 1023) / 1024, "-",
 		    memstat_get_numallocs(mtp));
 		first = 1;
 		xo_open_list("size");
 		/* Bit i of the size mask is printed as size 1 << (i + 4). */
 		for (i = 0; i < 32; i++) {
 			if (memstat_get_sizemask(mtp) & (1 << i)) {
 				if (!first)
 					xo_emit(",");
 				xo_emit("{l:size/%d}", 1 << (i + 4));
 				first = 0;
 			}
 		}
 		xo_close_list("size");
 		xo_close_instance("memory");
 		xo_emit("\n");
 	}
 	xo_close_list("memory");
 	xo_close_container("malloc-statistics");
 	memstat_mtl_free(mtlp);
 }
 
 /*
  * Print UMA zone statistics: one row per zone with its item size,
  * limit, items in use, free items, requests, failures and sleeps.
  *
  * Statistics come from sysctl, or from the kvm descriptor when one is
  * open.  A sysctl failure prints a warning and returns; a kvm failure
  * prints a warning and continues with whatever the list contains.
  */
 static void
 domemstat_zone(void)
 {
 	struct memory_type_list *mtlp;
 	struct memory_type *mtp;
 	char name[MEMTYPE_MAXNAME + 1];
 	int error;
 
 	mtlp = memstat_mtl_alloc();
 	if (mtlp == NULL) {
 		xo_warn("memstat_mtl_alloc");
 		return;
 	}
 	if (kd == NULL) {
 		if (memstat_sysctl_uma(mtlp, 0) < 0) {
 			xo_warnx("memstat_sysctl_uma: %s",
 			    memstat_strerror(memstat_mtl_geterror(mtlp)));
 			/* Do not leak the list on the error path. */
 			memstat_mtl_free(mtlp);
 			return;
 		}
 	} else {
 		if (memstat_kvm_uma(mtlp, kd) < 0) {
 			error = memstat_mtl_geterror(mtlp);
 			if (error == MEMSTAT_ERROR_KVM)
 				xo_warnx("memstat_kvm_uma: %s",
 				    kvm_geterr(kd));
 			else
 				xo_warnx("memstat_kvm_uma: %s",
 				    memstat_strerror(error));
 		}
 	}
 	xo_open_container("memory-zone-statistics");
 	xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} "
 		"{T:/%4s} {T:/%4s}\n\n", "ITEM", "SIZE",
 		"LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP");
 	xo_open_list("zone");
 	for (mtp = memstat_mtl_first(mtlp); mtp != NULL;
 	    mtp = memstat_mtl_next(mtp)) {
 		/*
 		 * The displayed name is the zone name followed by ':'.
 		 * The copy is bounded to MEMTYPE_MAXNAME bytes, which
 		 * leaves room for the colon in the one-byte-larger buffer.
 		 */
 		strlcpy(name, memstat_get_name(mtp), MEMTYPE_MAXNAME);
 		strcat(name, ":");
 		xo_open_instance("zone");
 		xo_emit("{d:name/%-20s}{ke:name/%s} {:size/%6" PRIu64 "}, "
 			"{:limit/%6" PRIu64 "},{:used/%8" PRIu64 "},"
 			"{:free/%8" PRIu64 "},{:requests/%8" PRIu64 "},"
 			"{:fail/%4" PRIu64 "},{:sleep/%4" PRIu64 "}\n", name,
 			memstat_get_name(mtp),
 			memstat_get_size(mtp), memstat_get_countlimit(mtp),
 			memstat_get_count(mtp), memstat_get_free(mtp),
 			memstat_get_numallocs(mtp), memstat_get_failures(mtp),
 			memstat_get_sleeps(mtp));
 		xo_close_instance("zone");
 	}
 	memstat_mtl_free(mtlp);
 	xo_close_list("zone");
 	xo_close_container("memory-zone-statistics");
 	xo_emit("\n");
 }
 
 /*
  * Print one row describing the VM object `kvo': resident, active and
  * inactive page counts, reference and shadow counts, a short code for
  * the memory attribute, a two-letter code for the object type, and the
  * path.
  *
  * The memory-attribute cases are compiled in only for the VM_MEMATTR_*
  * constants that are defined; anything unrecognised prints "??".
  */
 static void
 display_object(struct kinfo_vmobject *kvo)
 {
 	const char *str;
 
 	xo_open_instance("object");
-        xo_emit("{:resident/%5jd} ", (uintmax_t)kvo->kvo_resident);
-	xo_emit("{:active/%5jd} ", (uintmax_t)kvo->kvo_active);
-	xo_emit("{:inactive/%5jd} ", (uintmax_t)kvo->kvo_inactive);
+	xo_emit("{:resident/%5ju} ", (uintmax_t)kvo->kvo_resident);
+	xo_emit("{:active/%5ju} ", (uintmax_t)kvo->kvo_active);
+	xo_emit("{:inactive/%5ju} ", (uintmax_t)kvo->kvo_inactive);
 	xo_emit("{:refcount/%3d} ", kvo->kvo_ref_count);
 	xo_emit("{:shadowcount/%3d} ", kvo->kvo_shadow_count);
 	/* Map the memory attribute to its short display code. */
 	switch (kvo->kvo_memattr) {
 #ifdef VM_MEMATTR_UNCACHEABLE
 	case VM_MEMATTR_UNCACHEABLE:
 		str = "UC";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_COMBINING
 	case VM_MEMATTR_WRITE_COMBINING:
 		str = "WC";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_THROUGH
 	case VM_MEMATTR_WRITE_THROUGH:
 		str = "WT";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_PROTECTED
 	case VM_MEMATTR_WRITE_PROTECTED:
 		str = "WP";
 		break;
 #endif
 #ifdef VM_MEMATTR_WRITE_BACK
 	case VM_MEMATTR_WRITE_BACK:
 		str = "WB";
 		break;
 #endif
 #ifdef VM_MEMATTR_WEAK_UNCACHEABLE
 	case VM_MEMATTR_WEAK_UNCACHEABLE:
 		str = "UC-";
 		break;
 #endif
 #ifdef VM_MEMATTR_WB_WA
 	case VM_MEMATTR_WB_WA:
 		str = "WB";
 		break;
 #endif
 #ifdef VM_MEMATTR_NOCACHE
 	case VM_MEMATTR_NOCACHE:
 		str = "NC";
 		break;
 #endif
 #ifdef VM_MEMATTR_DEVICE
 	case VM_MEMATTR_DEVICE:
 		str = "DEV";
 		break;
 #endif
 #ifdef VM_MEMATTR_CACHEABLE
 	case VM_MEMATTR_CACHEABLE:
 		str = "C";
 		break;
 #endif
 #ifdef VM_MEMATTR_PREFETCHABLE
 	case VM_MEMATTR_PREFETCHABLE:
 		str = "PRE";
 		break;
 #endif
 	default:
 		str = "??";
 		break;
 	}
 	xo_emit("{:attribute/%-3s} ", str);
 	/* Map the object type to its two-letter display code. */
 	switch (kvo->kvo_type) {
 	case KVME_TYPE_NONE:
 		str = "--";
 		break;
 	case KVME_TYPE_DEFAULT:
 		str = "df";
 		break;
 	case KVME_TYPE_VNODE:
 		str = "vn";
 		break;
 	case KVME_TYPE_SWAP:
 		str = "sw";
 		break;
 	case KVME_TYPE_DEVICE:
 		str = "dv";
 		break;
 	case KVME_TYPE_PHYS:
 		str = "ph";
 		break;
 	case KVME_TYPE_DEAD:
 		str = "dd";
 		break;
 	case KVME_TYPE_SG:
 		str = "sg";
 		break;
 	case KVME_TYPE_MGTDEVICE:
 		str = "md";
 		break;
 	case KVME_TYPE_UNKNOWN:
 	default:
 		str = "??";
 		break;
 	}
 	xo_emit("{:type/%-2s} ", str);
 	xo_emit("{:path/%-s}\n", kvo->kvo_path);
 	xo_close_instance("object");
 }
 
 static void
 doobjstat(void)
 {
 	struct kinfo_vmobject *kvo;
 	int cnt, i;
 
 	kvo = kinfo_getvmobject(&cnt);
 	if (kvo == NULL) {
 		xo_warn("Failed to fetch VM object list");
 		return;
 	}
 	xo_emit("{T:RES/%5s} {T:ACT/%5s} {T:INACT/%5s} {T:REF/%3s} {T:SHD/%3s} "
 	        "{T:CM/%3s} {T:TP/%2s} {T:PATH/%s}\n");
 	xo_open_list("object");
 	for (i = 0; i < cnt; i++)
 		display_object(&kvo[i]);
 	free(kvo);
 	xo_close_list("object");
 }
 
 /*
  * kreado reads `size' bytes into `addr' from the kernel, starting
  * `offset' bytes past the address of the symbol at nlist index `nlx'.
  * An unresolved symbol or a short read is fatal; the symbol is named in
  * the message without its leading underscore.
  */
 static void
 kreado(int nlx, void *addr, size_t size, size_t offset)
 {
 	const char *sym;
 	int missing;
 
 	missing = (namelist[nlx].n_type == 0 || namelist[nlx].n_value == 0);
 	if (!missing && (size_t)kvm_read(kd, namelist[nlx].n_value + offset,
 	    addr, size) == size)
 		return;
 
 	/* Failure: build the printable symbol name and bail out. */
 	sym = namelist[nlx].n_name;
 	if (*sym == '_')
 		++sym;
 	if (missing)
 		xo_errx(1, "symbol %s not defined", sym);
 	else
 		xo_errx(1, "%s: %s", sym, kvm_geterr(kd));
 }
 
 /*
  * kread reads `size' bytes into `addr' from the kernel symbol at nlist
  * index `nlx'; it is kreado() with a zero offset.
  */
 static void
 kread(int nlx, void *addr, size_t size)
 {
 	kreado(nlx, addr, size, 0);
 }
 
 /*
  * Copy the NUL-terminated string at kernel address `strp' into a newly
  * allocated buffer, reading one byte at a time through kvm_read().
  * Returns the buffer, which the caller must free.  Allocation and read
  * failures are fatal.
  */
 static char *
 kgetstr(const char *strp)
 {
 	size_t cap = 0, n = 0;
 	char *ret = NULL;
 
 	do {
 		/*
 		 * Grow before writing ret[n], so that the buffer always
 		 * holds at least n + 1 bytes at the time of the read.
 		 */
 		if (n == cap) {
 			cap = (cap == 0) ? 16 : cap * 2;
 			ret = realloc(ret, cap);
 			if (ret == NULL)
 				xo_err(1, "%s: realloc", __func__);
 		}
 		if (kvm_read(kd, (u_long)strp + n, &ret[n], 1) != 1)
 			xo_errx(1, "%s: %s", __func__, kvm_geterr(kd));
 	} while (ret[n++] != '\0');
 	return (ret);
 }
 
 /*
  * Print the command synopsis through libxo, finish libxo output and
  * exit with status 1.
  */
 static void
 usage(void)
 {
 	const char *first_line =
 	    "usage: vmstat [-afHhimoPsz] [-M core [-N system]] [-c count] [-n devs]\n";
 	const char *second_line =
 	    "              [-p type,if,pass] [-w wait] [disks] [wait [count]]\n";
 
 	xo_error("%s%s", first_line, second_line);
 	xo_finish();
 	exit(1);
 }
Index: projects/bsd_rdma_4_9
===================================================================
--- projects/bsd_rdma_4_9	(revision 326161)
+++ projects/bsd_rdma_4_9	(revision 326162)

Property changes on: projects/bsd_rdma_4_9
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r326132-326161