Index: user/ngie/bsnmp_cleanup/contrib/bsnmp/snmpd/main.c
===================================================================
--- user/ngie/bsnmp_cleanup/contrib/bsnmp/snmpd/main.c	(revision 298467)
+++ user/ngie/bsnmp_cleanup/contrib/bsnmp/snmpd/main.c	(revision 298468)
@@ -1,3281 +1,3281 @@
/*
 * Copyright (c) 2001-2003
 *	Fraunhofer Institute for Open Communication Systems (FhG Fokus).
 * All rights reserved.
 *
 * Author: Harti Brandt
 *
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Shteryana Sotirova Shopova
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Begemot: bsnmp/snmpd/main.c,v 1.100 2006/02/14 09:04:20 brandt_h Exp $
 *
 * SNMPd main stuff.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_TCPWRAPPERS #include #include #endif #include "support.h" #include "snmpmod.h" #include "snmpd.h" #include "tree.h" #include "oid.h" #define PATH_PID "/var/run/%s.pid" #define PATH_CONFIG "/etc/%s.config" #define PATH_ENGINE "/var/%s.engine" uint64_t this_tick; /* start of processing of current packet (absolute) */ uint64_t start_tick; /* start of processing */ struct systemg systemg = { NULL, { 8, { 1, 3, 6, 1, 4, 1, 1115, 7352 }}, NULL, NULL, NULL, 64 + 8 + 4, 0 }; struct debug debug = { 0, /* dump_pdus */ LOG_DEBUG, /* log_pri */ 0, /* evdebug */ }; struct snmpd snmpd = { 2048, /* txbuf */ 2048, /* rxbuf */ 0, /* comm_dis */ 0, /* auth_traps */ {0, 0, 0, 0}, /* trap1addr */ VERS_ENABLE_ALL,/* version_enable */ }; struct snmpd_stats snmpd_stats; struct snmpd_usmstat snmpd_usmstats; /* snmpEngine */ struct snmp_engine snmpd_engine; /* snmpSerialNo */ int32_t snmp_serial_no; struct snmpd_target_stats snmpd_target_stats; /* search path for config files */ const char *syspath = PATH_SYSCONFIG; /* list of all loaded modules */ struct lmodules lmodules = TAILQ_HEAD_INITIALIZER(lmodules); /* list of loaded modules during start-up in the order they were loaded */ static struct lmodules modules_start = TAILQ_HEAD_INITIALIZER(modules_start); /* list of all known communities */ struct community_list community_list = TAILQ_HEAD_INITIALIZER(community_list); /* list of all known USM users */ static struct usm_userlist usm_userlist = SLIST_HEAD_INITIALIZER(usm_userlist); /* A list of all VACM users configured, including v1, v2c and v3 */ static struct vacm_userlist vacm_userlist = SLIST_HEAD_INITIALIZER(vacm_userlist); /* A list of all VACM groups */ static struct vacm_grouplist vacm_grouplist = SLIST_HEAD_INITIALIZER(vacm_grouplist); static struct vacm_group vacm_default_group = { .groupname = "", }; /* The list of configured access entries */ static struct vacm_accesslist vacm_accesslist = TAILQ_HEAD_INITIALIZER(vacm_accesslist); /* The list of configured views */ static struct vacm_viewlist vacm_viewlist = SLIST_HEAD_INITIALIZER(vacm_viewlist); /* The list of configured contexts */ static struct vacm_contextlist vacm_contextlist = SLIST_HEAD_INITIALIZER(vacm_contextlist); /* list of all installed object resources */ struct objres_list objres_list = TAILQ_HEAD_INITIALIZER(objres_list); /* community value generator */ static u_int next_community_index = 1; /* list of all known ranges */ struct idrange_list idrange_list = TAILQ_HEAD_INITIALIZER(idrange_list); /* identifier generator */ u_int next_idrange = 1; /* list of all current timers */ struct timer_list timer_list = LIST_HEAD_INITIALIZER(timer_list); /* list of file descriptors */ struct fdesc_list fdesc_list = LIST_HEAD_INITIALIZER(fdesc_list); /* program arguments */ static char **progargs; static int nprogargs; /* current community */ u_int community; static struct community *comm; /* current USM user */ struct usm_user *usm_user; /* file names */ static char config_file[MAXPATHLEN + 1]; static char pid_file[MAXPATHLEN + 1]; char engine_file[MAXPATHLEN + 1]; #ifndef USE_LIBBEGEMOT /* event context */ static evContext evctx; #endif /* signal mask */ static sigset_t blocked_sigs; /* signal handling */ static int work; #define WORK_DOINFO 0x0001 #define WORK_RECONFIG 0x0002 /* oids */ static const struct asn_oid oid_snmpMIB = OIDX_snmpMIB, oid_begemotSnmpd = OIDX_begemotSnmpd, 
oid_coldStart = OIDX_coldStart, oid_authenticationFailure = OIDX_authenticationFailure; const struct asn_oid oid_zeroDotZero = { 2, { 0, 0 }}; const struct asn_oid oid_usmUnknownEngineIDs = { 11, { 1, 3, 6, 1, 6, 3, 15, 1, 1, 4, 0}}; const struct asn_oid oid_usmNotInTimeWindows = { 11, { 1, 3, 6, 1, 6, 3, 15, 1, 1, 2, 0}}; /* request id generator for traps */ u_int trap_reqid; /* help text */ static const char usgtxt[] = "\ Begemot simple SNMP daemon. Copyright (c) 2001-2002 Fraunhofer Institute for\n\ Open Communication Systems (FhG Fokus). All rights reserved.\n\ Copyright (c) 2010 The FreeBSD Foundation. All rights reserved.\n\ usage: snmpd [-dh] [-c file] [-D options] [-e file] [-I path]\n\ [-l prefix] [-m variable=value] [-p file]\n\ options:\n\ -d don't daemonize\n\ -h print this info\n\ -c file specify configuration file\n\ -D options debugging options\n\ -e file specify engine id file\n\ -I path system include path\n\ -l prefix default basename for pid and config file\n\ -m var=val define variable\n\ -p file specify pid file\n\ "; /* hosts_access(3) request */ #ifdef USE_TCPWRAPPERS static struct request_info req; #endif /* transports */ extern const struct transport_def udp_trans; extern const struct transport_def lsock_trans; struct transport_list transport_list = TAILQ_HEAD_INITIALIZER(transport_list); /* forward declarations */ static void snmp_printf_func(const char *fmt, ...); static void snmp_error_func(const char *err, ...); static void snmp_debug_func(const char *err, ...); static void asn_error_func(const struct asn_buf *b, const char *err, ...); /* * Allocate rx/tx buffer. We allocate one byte more for rx. */ void * buf_alloc(int tx) { void *buf; if ((buf = malloc(tx ? snmpd.txbuf : snmpd.rxbuf)) == NULL) { syslog(LOG_CRIT, "cannot allocate buffer"); if (tx) snmpd_stats.noTxbuf++; else snmpd_stats.noRxbuf++; return (NULL); } return (buf); } /* * Return the buffer size. */ size_t buf_size(int tx) { return (tx ? snmpd.txbuf : snmpd.rxbuf); } /* * Prepare a PDU for output */ void snmp_output(struct snmp_pdu *pdu, u_char *sndbuf, size_t *sndlen, const char *dest) { struct asn_buf resp_b; resp_b.asn_ptr = sndbuf; resp_b.asn_len = snmpd.txbuf; if (snmp_pdu_encode(pdu, &resp_b) != 0) { syslog(LOG_ERR, "cannot encode message"); abort(); } if (debug.dump_pdus) { snmp_printf("%s <- ", dest); snmp_pdu_dump(pdu); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); } /* * Check USM PDU header credentials against local SNMP Engine & users. 
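 *
 * Two RFC 3414 discovery exchanges are recognized here and flagged with
 * SNMP_MSG_AUTODISCOVER so that the access checks are skipped for them:
 * an unauthenticated probe (empty engine id and empty security name),
 * which is answered with the local snmpEngineID, and an authenticated
 * probe (engineBoots == 0 and engineTime == 0), which is answered with
 * the local boots/time pair for clock synchronization.  Any other
 * authenticated request must fall inside the time window, i.e. (a
 * sketch of the rule applied below):
 *
 *	pdu->engine.engine_boots == snmpd_engine.engine_boots &&
 *	abs(pdu->engine.engine_time - snmpd_engine.engine_time) <=
 *	    SNMP_TIME_WINDOW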
*/ static enum snmp_code snmp_pdu_auth_user(struct snmp_pdu *pdu) { uint64_t etime; usm_user = NULL; /* un-authenticated snmpEngineId discovery */ if (pdu->engine.engine_len == 0 && strlen(pdu->user.sec_name) == 0) { pdu->engine.engine_len = snmpd_engine.engine_len; memcpy(pdu->engine.engine_id, snmpd_engine.engine_id, snmpd_engine.engine_len); pdu->engine.engine_boots = snmpd_engine.engine_boots; pdu->engine.engine_time = snmpd_engine.engine_time; pdu->flags |= SNMP_MSG_AUTODISCOVER; return (SNMP_CODE_OK); } if ((usm_user = usm_find_user(pdu->engine.engine_id, pdu->engine.engine_len, pdu->user.sec_name)) == NULL || usm_user->status != 1 /* active */) return (SNMP_CODE_BADUSER); if (usm_user->user_engine_len != snmpd_engine.engine_len || memcmp(usm_user->user_engine_id, snmpd_engine.engine_id, snmpd_engine.engine_len) != 0) return (SNMP_CODE_BADENGINE); pdu->user.priv_proto = usm_user->suser.priv_proto; memcpy(pdu->user.priv_key, usm_user->suser.priv_key, sizeof(pdu->user.priv_key)); /* authenticated snmpEngineId discovery */ if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) { etime = (get_ticks() - start_tick) / 100ULL; if (etime < INT32_MAX) snmpd_engine.engine_time = etime; else { start_tick = get_ticks(); set_snmpd_engine(); snmpd_engine.engine_time = start_tick; } pdu->user.auth_proto = usm_user->suser.auth_proto; memcpy(pdu->user.auth_key, usm_user->suser.auth_key, sizeof(pdu->user.auth_key)); if (pdu->engine.engine_boots == 0 && pdu->engine.engine_time == 0) { pdu->flags |= SNMP_MSG_AUTODISCOVER; return (SNMP_CODE_OK); } if (pdu->engine.engine_boots != snmpd_engine.engine_boots || abs(pdu->engine.engine_time - snmpd_engine.engine_time) > SNMP_TIME_WINDOW) return (SNMP_CODE_NOTINTIME); } if (((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0 && (pdu->flags & SNMP_MSG_AUTH_FLAG) == 0) || ((pdu->flags & SNMP_MSG_AUTH_FLAG) == 0 && usm_user->suser.auth_proto != SNMP_AUTH_NOAUTH) || ((pdu->flags & SNMP_MSG_PRIV_FLAG) == 0 && usm_user->suser.priv_proto != SNMP_PRIV_NOPRIV)) return (SNMP_CODE_BADSECLEVEL); return (SNMP_CODE_OK); } /* * Check whether access to each of var bindings in the PDU is allowed based * on the user credentials against the configured User groups & VACM views. */ enum snmp_code snmp_pdu_auth_access(struct snmp_pdu *pdu, int32_t *ip) { const char *uname; int32_t suboid, smodel; uint32_t i; struct vacm_user *vuser; struct vacm_access *acl; struct vacm_context *vacmctx; struct vacm_view *view; /* * At least a default context exists if the snmpd_vacm(3) module is * running. 
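 *
 * The per-binding check at the bottom of this function reduces to the
 * following predicate (an illustrative helper, not defined in this
 * file): an OID passes an included view iff it lies under the view's
 * subtree, and passes an excluded view iff it does not:
 *
 *	static int
 *	view_allows(const struct vacm_view *view, const struct asn_oid *oid)
 *	{
 *		int under = asn_is_suboid(&view->subtree, oid);
 *
 *		return (view->exclude ? !under : under);
 *	}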
*/ if (SLIST_EMPTY(&vacm_contextlist) || (pdu->flags & SNMP_MSG_AUTODISCOVER) != 0) return (SNMP_CODE_OK); switch (pdu->version) { case SNMP_V1: if ((uname = comm_string(community)) == NULL) return (SNMP_CODE_FAILED); smodel = SNMP_SECMODEL_SNMPv1; break; case SNMP_V2c: if ((uname = comm_string(community)) == NULL) return (SNMP_CODE_FAILED); smodel = SNMP_SECMODEL_SNMPv2c; break; case SNMP_V3: uname = pdu->user.sec_name; if ((smodel = pdu->security_model) != SNMP_SECMODEL_USM) return (SNMP_CODE_FAILED); /* Compare the PDU context engine id against the agent's */ if (pdu->context_engine_len != snmpd_engine.engine_len || memcmp(pdu->context_engine, snmpd_engine.engine_id, snmpd_engine.engine_len) != 0) return (SNMP_CODE_FAILED); break; default: abort(); } SLIST_FOREACH(vuser, &vacm_userlist, vvu) if (strcmp(uname, vuser->secname) == 0 && vuser->sec_model == smodel) break; if (vuser == NULL || vuser->group == NULL) return (SNMP_CODE_FAILED); /* XXX: shteryana - recheck */ TAILQ_FOREACH_REVERSE(acl, &vacm_accesslist, vacm_accesslist, vva) { if (acl->group != vuser->group) continue; SLIST_FOREACH(vacmctx, &vacm_contextlist, vcl) if (memcmp(vacmctx->ctxname, acl->ctx_prefix, acl->ctx_match) == 0) goto match; } return (SNMP_CODE_FAILED); match: switch (pdu->type) { case SNMP_PDU_GET: case SNMP_PDU_GETNEXT: case SNMP_PDU_GETBULK: if ((view = acl->read_view) == NULL) return (SNMP_CODE_FAILED); break; case SNMP_PDU_SET: if ((view = acl->write_view) == NULL) return (SNMP_CODE_FAILED); break; case SNMP_PDU_TRAP: case SNMP_PDU_INFORM: case SNMP_PDU_TRAP2: case SNMP_PDU_REPORT: if ((view = acl->notify_view) == NULL) return (SNMP_CODE_FAILED); break; case SNMP_PDU_RESPONSE: /* NOTREACHED */ return (SNMP_CODE_FAILED); default: abort(); } for (i = 0; i < pdu->nbindings; i++) { /* XXX - view->mask*/ suboid = asn_is_suboid(&view->subtree, &pdu->bindings[i].var); if ((!suboid && !view->exclude) || (suboid && view->exclude)) { *ip = i + 1; return (SNMP_CODE_FAILED); } } return (SNMP_CODE_OK); } /* * SNMP input. Start: decode the PDU, find the user or community. */ enum snmpd_input_err snmp_input_start(const u_char *buf, size_t len, const char *source, struct snmp_pdu *pdu, int32_t *ip, size_t *pdulen) { struct asn_buf b; enum snmp_code code; enum snmpd_input_err ret; int sret; /* update uptime */ this_tick = get_ticks(); b.asn_cptr = buf; b.asn_len = len; /* look whether we have enough bytes for the entire PDU. 
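 *
 * snmp_pdu_snoop() reports how the buffer relates to one complete PDU:
 * a positive return is the encoded length of the PDU, 0 means more
 * bytes are needed (meaningful only on stream transports, which keep
 * the partial input and retry after the next read) and -1 is a hard
 * ASN.1 parse error.  The stream case is what makes SNMPD_INPUT_TRUNC
 * a non-error further down in snmpd_input(): the daemon just waits for
 * more data, up to at most one full receive buffer.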
*/ switch (sret = snmp_pdu_snoop(&b)) { case 0: return (SNMPD_INPUT_TRUNC); case -1: snmpd_stats.inASNParseErrs++; return (SNMPD_INPUT_FAILED); } b.asn_len = *pdulen = (size_t)sret; memset(pdu, 0, sizeof(*pdu)); if ((code = snmp_pdu_decode_header(&b, pdu)) != SNMP_CODE_OK) goto decoded; if (pdu->version == SNMP_V3) { if (pdu->security_model != SNMP_SECMODEL_USM) { code = SNMP_CODE_FAILED; goto decoded; } if ((code = snmp_pdu_auth_user(pdu)) != SNMP_CODE_OK) goto decoded; if ((code = snmp_pdu_decode_secmode(&b, pdu)) != SNMP_CODE_OK) goto decoded; } code = snmp_pdu_decode_scoped(&b, pdu, ip); ret = SNMPD_INPUT_OK; decoded: snmpd_stats.inPkts++; switch (code) { case SNMP_CODE_FAILED: snmpd_stats.inASNParseErrs++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADVERS: bad_vers: snmpd_stats.inBadVersions++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADLEN: if (pdu->type == SNMP_OP_SET) ret = SNMPD_INPUT_VALBADLEN; break; case SNMP_CODE_OORANGE: if (pdu->type == SNMP_OP_SET) ret = SNMPD_INPUT_VALRANGE; break; case SNMP_CODE_BADENC: if (pdu->type == SNMP_OP_SET) ret = SNMPD_INPUT_VALBADENC; break; case SNMP_CODE_BADSECLEVEL: snmpd_usmstats.unsupported_seclevels++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_NOTINTIME: snmpd_usmstats.not_in_time_windows++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADUSER: snmpd_usmstats.unknown_users++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADENGINE: snmpd_usmstats.unknown_engine_ids++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADDIGEST: snmpd_usmstats.wrong_digests++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_EDECRYPT: snmpd_usmstats.decrypt_errors++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_OK: switch (pdu->version) { case SNMP_V1: if (!(snmpd.version_enable & VERS_ENABLE_V1)) goto bad_vers; break; case SNMP_V2c: if (!(snmpd.version_enable & VERS_ENABLE_V2C)) goto bad_vers; break; case SNMP_V3: if (!(snmpd.version_enable & VERS_ENABLE_V3)) goto bad_vers; break; case SNMP_Verr: goto bad_vers; } break; } if (debug.dump_pdus) { snmp_printf("%s -> ", source); snmp_pdu_dump(pdu); } /* * Look, whether we know the community or user */ if (pdu->version != SNMP_V3) { TAILQ_FOREACH(comm, &community_list, link) if (comm->string != NULL && strcmp(comm->string, pdu->community) == 0) break; if (comm == NULL) { snmpd_stats.inBadCommunityNames++; snmp_pdu_free(pdu); if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); ret = SNMPD_INPUT_BAD_COMM; } else community = comm->value; } else if (pdu->nbindings == 0) { /* RFC 3414 - snmpEngineID Discovery */ if (strlen(pdu->user.sec_name) == 0) { asn_append_oid(&(pdu->bindings[pdu->nbindings++].var), &oid_usmUnknownEngineIDs); pdu->context_engine_len = snmpd_engine.engine_len; memcpy(pdu->context_engine, snmpd_engine.engine_id, snmpd_engine.engine_len); } else if (pdu->engine.engine_boots == 0 && pdu->engine.engine_time == 0) { asn_append_oid(&(pdu->bindings[pdu->nbindings++].var), &oid_usmNotInTimeWindows); pdu->engine.engine_boots = snmpd_engine.engine_boots; pdu->engine.engine_time = snmpd_engine.engine_time; } } else if (usm_user->suser.auth_proto != SNMP_AUTH_NOAUTH && (pdu->engine.engine_boots == 0 || pdu->engine.engine_time == 0)) { snmpd_usmstats.not_in_time_windows++; ret = SNMPD_INPUT_FAILED; } if ((code = snmp_pdu_auth_access(pdu, ip)) != SNMP_CODE_OK) ret = SNMPD_INPUT_FAILED; return (ret); } /* * Will return only _OK or _FAILED */ enum snmpd_input_err snmp_input_finish(struct snmp_pdu *pdu, const u_char *rcvbuf, size_t rcvlen, u_char *sndbuf, size_t *sndlen, const 
char *source, enum snmpd_input_err ierr, int32_t ivar, void *data) { struct snmp_pdu resp; struct asn_buf resp_b, pdu_b; enum snmp_ret ret; resp_b.asn_ptr = sndbuf; resp_b.asn_len = snmpd.txbuf; pdu_b.asn_cptr = rcvbuf; pdu_b.asn_len = rcvlen; if (ierr != SNMPD_INPUT_OK) { /* error decoding the input of a SET */ if (pdu->version == SNMP_V1) pdu->error_status = SNMP_ERR_BADVALUE; else if (ierr == SNMPD_INPUT_VALBADLEN) pdu->error_status = SNMP_ERR_WRONG_LENGTH; else if (ierr == SNMPD_INPUT_VALRANGE) pdu->error_status = SNMP_ERR_WRONG_VALUE; else pdu->error_status = SNMP_ERR_WRONG_ENCODING; pdu->error_index = ivar; if (snmp_make_errresp(pdu, &pdu_b, &resp_b) == SNMP_RET_IGN) { syslog(LOG_WARNING, "could not encode error response"); snmpd_stats.silentDrops++; return (SNMPD_INPUT_FAILED); } if (debug.dump_pdus) { snmp_printf("%s <- ", source); snmp_pdu_dump(pdu); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); return (SNMPD_INPUT_OK); } switch (pdu->type) { case SNMP_PDU_GET: ret = snmp_get(pdu, &resp_b, &resp, data); break; case SNMP_PDU_GETNEXT: ret = snmp_getnext(pdu, &resp_b, &resp, data); break; case SNMP_PDU_SET: ret = snmp_set(pdu, &resp_b, &resp, data); break; case SNMP_PDU_GETBULK: ret = snmp_getbulk(pdu, &resp_b, &resp, data); break; default: ret = SNMP_RET_IGN; break; } switch (ret) { case SNMP_RET_OK: /* normal return - send a response */ if (debug.dump_pdus) { snmp_printf("%s <- ", source); snmp_pdu_dump(&resp); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); snmp_pdu_free(&resp); return (SNMPD_INPUT_OK); case SNMP_RET_IGN: /* error - send nothing */ snmpd_stats.silentDrops++; return (SNMPD_INPUT_FAILED); case SNMP_RET_ERR: /* error - send error response. The snmp routine has * changed the error fields in the original message. */ resp_b.asn_ptr = sndbuf; resp_b.asn_len = snmpd.txbuf; if (snmp_make_errresp(pdu, &pdu_b, &resp_b) == SNMP_RET_IGN) { syslog(LOG_WARNING, "could not encode error response"); snmpd_stats.silentDrops++; return (SNMPD_INPUT_FAILED); } else { if (debug.dump_pdus) { snmp_printf("%s <- ", source); snmp_pdu_dump(pdu); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); return (SNMPD_INPUT_OK); } } abort(); } /* * Insert a port into the right place in the transport's table of ports */ void trans_insert_port(struct transport *t, struct tport *port) { struct tport *p; TAILQ_FOREACH(p, &t->table, link) { if (asn_compare_oid(&p->index, &port->index) > 0) { TAILQ_INSERT_BEFORE(p, port, link); return; } } port->transport = t; TAILQ_INSERT_TAIL(&t->table, port, link); } /* * Remove a port from a transport's list */ void trans_remove_port(struct tport *port) { TAILQ_REMOVE(&port->transport->table, port, link); } /* * Find a port on a transport's list */ struct tport * trans_find_port(struct transport *t, const struct asn_oid *idx, u_int sub) { return (FIND_OBJECT_OID(&t->table, idx, sub)); } /* * Find next port on a transport's list */ struct tport * trans_next_port(struct transport *t, const struct asn_oid *idx, u_int sub) { return (NEXT_OBJECT_OID(&t->table, idx, sub)); } /* * Return first port */ struct tport * trans_first_port(struct transport *t) { return (TAILQ_FIRST(&t->table)); } /* * Iterate through all ports until a function returns a 0. 
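 *
 * A minimal usage sketch (count_cb is hypothetical, not part of this
 * file): the callback keeps the iteration going by returning non-zero;
 * returning 0 stops it and makes trans_iter_port() hand back the
 * current port, so a full sweep ends with NULL:
 *
 *	static int
 *	count_cb(struct tport *p __unused, intptr_t arg)
 *	{
 *		(*(u_int *)arg)++;
 *		return (1);
 *	}
 *
 *	u_int n = 0;
 *	(void)trans_iter_port(t, count_cb, (intptr_t)&n);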
*/ struct tport * trans_iter_port(struct transport *t, int (*func)(struct tport *, intptr_t), intptr_t arg) { struct tport *p; TAILQ_FOREACH(p, &t->table, link) if (func(p, arg) == 0) return (p); return (NULL); } /* * Register a transport */ int trans_register(const struct transport_def *def, struct transport **pp) { u_int i; char or_descr[256]; if ((*pp = malloc(sizeof(**pp))) == NULL) return (SNMP_ERR_GENERR); /* construct index */ (*pp)->index.len = strlen(def->name) + 1; (*pp)->index.subs[0] = strlen(def->name); for (i = 0; i < (*pp)->index.subs[0]; i++) (*pp)->index.subs[i + 1] = def->name[i]; (*pp)->vtab = def; if (FIND_OBJECT_OID(&transport_list, &(*pp)->index, 0) != NULL) { free(*pp); return (SNMP_ERR_INCONS_VALUE); } /* register module */ snprintf(or_descr, sizeof(or_descr), "%s transport mapping", def->name); if (((*pp)->or_index = or_register(&def->id, or_descr, NULL)) == 0) { free(*pp); return (SNMP_ERR_GENERR); } INSERT_OBJECT_OID((*pp), &transport_list); TAILQ_INIT(&(*pp)->table); return (SNMP_ERR_NOERROR); } /* * Unregister transport */ int trans_unregister(struct transport *t) { if (!TAILQ_EMPTY(&t->table)) return (SNMP_ERR_INCONS_VALUE); or_unregister(t->or_index); TAILQ_REMOVE(&transport_list, t, link); return (SNMP_ERR_NOERROR); } /* * File descriptor support */ #ifdef USE_LIBBEGEMOT static void input(int fd, int mask __unused, void *uap) #else static void input(evContext ctx __unused, void *uap, int fd, int mask __unused) #endif { struct fdesc *f = uap; (*f->func)(fd, f->udata); } void fd_suspend(void *p) { struct fdesc *f = p; #ifdef USE_LIBBEGEMOT if (f->id >= 0) { poll_unregister(f->id); f->id = -1; } #else if (evTestID(f->id)) { (void)evDeselectFD(evctx, f->id); evInitID(&f->id); } #endif } int fd_resume(void *p) { struct fdesc *f = p; int err; #ifdef USE_LIBBEGEMOT if (f->id >= 0) return (0); if ((f->id = poll_register(f->fd, input, f, POLL_IN)) < 0) { err = errno; syslog(LOG_ERR, "select fd %d: %m", f->fd); errno = err; return (-1); } #else if (evTestID(f->id)) return (0); if (evSelectFD(evctx, f->fd, EV_READ, input, f, &f->id)) { err = errno; syslog(LOG_ERR, "select fd %d: %m", f->fd); errno = err; return (-1); } #endif return (0); } void * fd_select(int fd, void (*func)(int, void *), void *udata, struct lmodule *mod) { struct fdesc *f; int err; if ((f = malloc(sizeof(struct fdesc))) == NULL) { err = errno; syslog(LOG_ERR, "fd_select: %m"); errno = err; return (NULL); } f->fd = fd; f->func = func; f->udata = udata; f->owner = mod; #ifdef USE_LIBBEGEMOT f->id = -1; #else evInitID(&f->id); #endif if (fd_resume(f)) { err = errno; free(f); errno = err; return (NULL); } LIST_INSERT_HEAD(&fdesc_list, f, link); return (f); } void fd_deselect(void *p) { struct fdesc *f = p; LIST_REMOVE(f, link); fd_suspend(f); free(f); } static void fd_flush(struct lmodule *mod) { struct fdesc *t, *t1; t = LIST_FIRST(&fdesc_list); while (t != NULL) { t1 = LIST_NEXT(t, link); if (t->owner == mod) fd_deselect(t); t = t1; } } /* * Consume a message from the input buffer */ static void snmp_input_consume(struct port_input *pi) { if (!pi->stream) { /* always consume everything */ pi->length = 0; return; } if (pi->consumed >= pi->length) { /* all bytes consumed */ pi->length = 0; return; } memmove(pi->buf, pi->buf + pi->consumed, pi->length - pi->consumed); pi->length -= pi->consumed; } static void check_priv_dgram(struct port_input *pi, struct sockcred *cred) { /* process explicitly sends credentials */ if (cred) pi->priv = (cred->sc_euid == 0); else pi->priv = 0; } static void 
check_priv_stream(struct port_input *pi) { struct xucred ucred; socklen_t ucredlen; /* obtain the accept time credentials */ ucredlen = sizeof(ucred); if (getsockopt(pi->fd, 0, LOCAL_PEERCRED, &ucred, &ucredlen) == 0 && ucredlen >= sizeof(ucred) && ucred.cr_version == XUCRED_VERSION) pi->priv = (ucred.cr_uid == 0); else pi->priv = 0; } /* * Input from a stream socket. */ static int recv_stream(struct port_input *pi) { struct msghdr msg; struct iovec iov[1]; ssize_t len; if (pi->buf == NULL) { /* no buffer yet - allocate one */ if ((pi->buf = buf_alloc(0)) == NULL) { /* ups - could not get buffer. Return an error * the caller must close the transport. */ return (-1); } pi->buflen = buf_size(0); pi->consumed = 0; pi->length = 0; } /* try to get a message */ msg.msg_name = pi->peer; msg.msg_namelen = pi->peerlen; msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; iov[0].iov_base = pi->buf + pi->length; iov[0].iov_len = pi->buflen - pi->length; len = recvmsg(pi->fd, &msg, 0); if (len == -1 || len == 0) /* receive error */ return (-1); pi->length += len; if (pi->cred) check_priv_stream(pi); return (0); } /* * Input from a datagram socket. * Each receive should return one datagram. */ static int recv_dgram(struct port_input *pi, struct in_addr *laddr) { u_char embuf[1000]; char cbuf[CMSG_SPACE(SOCKCREDSIZE(CMGROUP_MAX)) + CMSG_SPACE(sizeof(struct in_addr))]; struct msghdr msg; struct iovec iov[1]; ssize_t len; struct cmsghdr *cmsg; struct sockcred *cred = NULL; if (pi->buf == NULL) { /* no buffer yet - allocate one */ if ((pi->buf = buf_alloc(0)) == NULL) { /* ups - could not get buffer. Read away input * and drop it */ (void)recvfrom(pi->fd, embuf, sizeof(embuf), 0, NULL, NULL); /* return error */ return (-1); } pi->buflen = buf_size(0); } /* try to get a message */ msg.msg_name = pi->peer; msg.msg_namelen = pi->peerlen; msg.msg_iov = iov; msg.msg_iovlen = 1; memset(cbuf, 0, sizeof(cbuf)); msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); msg.msg_flags = 0; iov[0].iov_base = pi->buf; iov[0].iov_len = pi->buflen; len = recvmsg(pi->fd, &msg, 0); if (len == -1 || len == 0) /* receive error */ return (-1); if (msg.msg_flags & MSG_TRUNC) { /* truncated - drop */ snmpd_stats.silentDrops++; snmpd_stats.inTooLong++; return (-1); } pi->length = (size_t)len; for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVDSTADDR) memcpy(laddr, CMSG_DATA(cmsg), sizeof(struct in_addr)); if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDS) memcpy(cred, CMSG_DATA(cmsg), sizeof(struct sockcred)); } if (pi->cred) check_priv_dgram(pi, cred); return (0); } /* * Input from a socket */ int snmpd_input(struct port_input *pi, struct tport *tport) { u_char *sndbuf; size_t sndlen; struct snmp_pdu pdu; enum snmpd_input_err ierr, ferr; enum snmpd_proxy_err perr; int32_t vi; int ret; ssize_t slen; #ifdef USE_TCPWRAPPERS char client[16]; #endif struct msghdr msg; struct iovec iov[1]; char cbuf[CMSG_SPACE(sizeof(struct in_addr))]; struct cmsghdr *cmsgp; /* get input depending on the transport */ if (pi->stream) { msg.msg_control = NULL; msg.msg_controllen = 0; ret = recv_stream(pi); } else { struct in_addr laddr; memset(cbuf, 0, CMSG_SPACE(sizeof(struct in_addr))); msg.msg_control = cbuf; msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr)); cmsgp = CMSG_FIRSTHDR(&msg); cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in_addr)); cmsgp->cmsg_level = IPPROTO_IP; 
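	/*
	 * Note: this control buffer does double duty.  recv_dgram()
	 * recovers the datagram's destination address from the
	 * IP_RECVDSTADDR control message it receives, and the header
	 * prepared here is meant to carry that address back as
	 * IP_SENDSRCADDR on the reply, so the response leaves from the
	 * address the request was sent to (unless no address was
	 * recovered, in which case the control data is dropped below).
	 */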
cmsgp->cmsg_type = IP_SENDSRCADDR; memcpy(&laddr, CMSG_DATA(cmsgp), sizeof(struct in_addr)); ret = recv_dgram(pi, &laddr); if (laddr.s_addr == 0) { msg.msg_control = NULL; msg.msg_controllen = 0; } } if (ret == -1) return (-1); #ifdef USE_TCPWRAPPERS /* * In case of AF_INET{6} peer, do hosts_access(5) check. */ if (pi->peer->sa_family != AF_LOCAL && inet_ntop(pi->peer->sa_family, &((const struct sockaddr_in *)(const void *)pi->peer)->sin_addr, client, sizeof(client)) != NULL) { request_set(&req, RQ_CLIENT_ADDR, client, 0); if (hosts_access(&req) == 0) { syslog(LOG_ERR, "refused connection from %.500s", eval_client(&req)); return (-1); } } else if (pi->peer->sa_family != AF_LOCAL) syslog(LOG_ERR, "inet_ntop(): %m"); #endif /* * Handle input */ ierr = snmp_input_start(pi->buf, pi->length, "SNMP", &pdu, &vi, &pi->consumed); if (ierr == SNMPD_INPUT_TRUNC) { /* need more bytes. This is ok only for streaming transports. * but only if we have not reached bufsiz yet. */ if (pi->stream) { if (pi->length == buf_size(0)) { snmpd_stats.silentDrops++; return (-1); } return (0); } snmpd_stats.silentDrops++; return (-1); } /* can't check for bad SET pdus here, because a proxy may have to * check the access first. We don't want to return an error response * to a proxy PDU with a wrong community */ if (ierr == SNMPD_INPUT_FAILED) { /* for streaming transports this is fatal */ if (pi->stream) return (-1); snmp_input_consume(pi); return (0); } if (ierr == SNMPD_INPUT_BAD_COMM) { snmp_input_consume(pi); return (0); } /* * If that is a module community and the module has a proxy function, * the hand it over to the module. */ if (comm != NULL && comm->owner != NULL && comm->owner->config->proxy != NULL) { perr = (*comm->owner->config->proxy)(&pdu, tport->transport, &tport->index, pi->peer, pi->peerlen, ierr, vi, !pi->cred || pi->priv); switch (perr) { case SNMPD_PROXY_OK: snmp_input_consume(pi); return (0); case SNMPD_PROXY_REJ: break; case SNMPD_PROXY_DROP: snmp_input_consume(pi); snmp_pdu_free(&pdu); snmpd_stats.proxyDrops++; return (0); case SNMPD_PROXY_BADCOMM: snmp_input_consume(pi); snmp_pdu_free(&pdu); snmpd_stats.inBadCommunityNames++; if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); return (0); case SNMPD_PROXY_BADCOMMUSE: snmp_input_consume(pi); snmp_pdu_free(&pdu); snmpd_stats.inBadCommunityUses++; if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); return (0); } } /* * Check type */ if (pdu.type == SNMP_PDU_RESPONSE || pdu.type == SNMP_PDU_TRAP || pdu.type == SNMP_PDU_TRAP2) { snmpd_stats.silentDrops++; snmpd_stats.inBadPduTypes++; snmp_pdu_free(&pdu); snmp_input_consume(pi); return (0); } /* * Check community */ if (pdu.version < SNMP_V3 && ((pi->cred && !pi->priv && pdu.type == SNMP_PDU_SET) || (community != COMM_WRITE && (pdu.type == SNMP_PDU_SET || community != COMM_READ)))) { snmpd_stats.inBadCommunityUses++; snmp_pdu_free(&pdu); snmp_input_consume(pi); if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); return (0); } /* * Execute it. 
*/ if ((sndbuf = buf_alloc(1)) == NULL) { snmpd_stats.silentDrops++; snmp_pdu_free(&pdu); snmp_input_consume(pi); return (0); } ferr = snmp_input_finish(&pdu, pi->buf, pi->length, sndbuf, &sndlen, "SNMP", ierr, vi, NULL); if (ferr == SNMPD_INPUT_OK) { msg.msg_name = pi->peer; msg.msg_namelen = pi->peerlen; msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_flags = 0; iov[0].iov_base = sndbuf; iov[0].iov_len = sndlen; slen = sendmsg(pi->fd, &msg, 0); if (slen == -1) syslog(LOG_ERR, "sendmsg: %m"); else if ((size_t)slen != sndlen) syslog(LOG_ERR, "sendmsg: short write %zu/%zu", sndlen, (size_t)slen); } snmp_pdu_free(&pdu); free(sndbuf); snmp_input_consume(pi); return (0); } /* * Send a PDU to a given port */ void snmp_send_port(void *targ, const struct asn_oid *port, struct snmp_pdu *pdu, const struct sockaddr *addr, socklen_t addrlen) { struct transport *trans = targ; struct tport *tp; u_char *sndbuf; size_t sndlen; ssize_t len; TAILQ_FOREACH(tp, &trans->table, link) if (asn_compare_oid(port, &tp->index) == 0) break; if (tp == 0) return; if ((sndbuf = buf_alloc(1)) == NULL) return; snmp_output(pdu, sndbuf, &sndlen, "SNMP PROXY"); len = trans->vtab->send(tp, sndbuf, sndlen, addr, addrlen); if (len == -1) syslog(LOG_ERR, "sendto: %m"); else if ((size_t)len != sndlen) syslog(LOG_ERR, "sendto: short write %zu/%zu", sndlen, (size_t)len); free(sndbuf); } /* * Close an input source */ void snmpd_input_close(struct port_input *pi) { if (pi->id != NULL) fd_deselect(pi->id); if (pi->fd >= 0) (void)close(pi->fd); if (pi->buf != NULL) free(pi->buf); } /* * Dump internal state. */ #ifdef USE_LIBBEGEMOT static void info_func(void) #else static void info_func(evContext ctx __unused, void *uap __unused, const void *tag __unused) #endif { struct lmodule *m; u_int i; char buf[10000]; syslog(LOG_DEBUG, "Dump of SNMPd %lu\n", (u_long)getpid()); for (i = 0; i < tree_size; i++) { switch (tree[i].type) { case SNMP_NODE_LEAF: sprintf(buf, "LEAF: %s %s", tree[i].name, asn_oid2str(&tree[i].oid)); break; case SNMP_NODE_COLUMN: sprintf(buf, "COL: %s %s", tree[i].name, asn_oid2str(&tree[i].oid)); break; } syslog(LOG_DEBUG, "%s", buf); } TAILQ_FOREACH(m, &lmodules, link) if (m->config->dump) (*m->config->dump)(); } /* * Re-read configuration */ #ifdef USE_LIBBEGEMOT static void config_func(void) #else static void config_func(evContext ctx __unused, void *uap __unused, const void *tag __unused) #endif { struct lmodule *m; if (read_config(config_file, NULL)) { syslog(LOG_ERR, "error reading config file '%s'", config_file); return; } TAILQ_FOREACH(m, &lmodules, link) if (m->config->config) (*m->config->config)(); } /* * On USR1 dump actual configuration. 
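 *
 * The signal handlers below only set bits in the global `work' mask;
 * the main loop picks the flags up outside of signal context (via
 * evWaitFor()/evDo() in the libisc-event build) and runs info_func()
 * or config_func() from there, which keeps the handlers
 * async-signal-safe.  A running daemon is thus reconfigured with a
 * plain kill -HUP, and dumped with kill -USR1, against the pid
 * recorded in the pid file.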
*/ static void onusr1(int s __unused) { work |= WORK_DOINFO; } static void onhup(int s __unused) { work |= WORK_RECONFIG; } static void onterm(int s __unused) { /* allow clean-up */ exit(0); } static void init_sigs(void) { struct sigaction sa; sa.sa_handler = onusr1; sa.sa_flags = SA_RESTART; sigemptyset(&sa.sa_mask); if (sigaction(SIGUSR1, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } sa.sa_handler = onhup; if (sigaction(SIGHUP, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } sa.sa_handler = onterm; sa.sa_flags = 0; sigemptyset(&sa.sa_mask); if (sigaction(SIGTERM, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } if (sigaction(SIGINT, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } } static void block_sigs(void) { sigset_t set; sigfillset(&set); if (sigprocmask(SIG_BLOCK, &set, &blocked_sigs) == -1) { syslog(LOG_ERR, "SIG_BLOCK: %m"); exit(1); } } static void unblock_sigs(void) { if (sigprocmask(SIG_SETMASK, &blocked_sigs, NULL) == -1) { syslog(LOG_ERR, "SIG_SETMASK: %m"); exit(1); } } /* * Shut down */ static void term(void) { (void)unlink(pid_file); } static void trans_stop(void) { struct transport *t; TAILQ_FOREACH(t, &transport_list, link) (void)t->vtab->stop(1); } /* * Define a macro from the command line */ static void do_macro(char *arg) { char *eq; int err; if ((eq = strchr(arg, '=')) == NULL) err = define_macro(arg, ""); else { *eq++ = '\0'; err = define_macro(arg, eq); } if (err == -1) { syslog(LOG_ERR, "cannot save macro: %m"); exit(1); } } /* * Re-implement getsubopt from scratch, because the second argument is broken * and will not compile with WARNS=5. */ static int getsubopt1(char **arg, const char *const *options, char **valp, char **optp) { static const char *const delim = ",\t "; u_int i; char *ptr; *optp = NULL; /* skip leading junk */ for (ptr = *arg; *ptr != '\0'; ptr++) if (strchr(delim, *ptr) == NULL) break; if (*ptr == '\0') { *arg = ptr; return (-1); } *optp = ptr; /* find the end of the option */ while (*++ptr != '\0') if (strchr(delim, *ptr) != NULL || *ptr == '=') break; if (*ptr != '\0') { if (*ptr == '=') { *ptr++ = '\0'; *valp = ptr; while (*ptr != '\0' && strchr(delim, *ptr) == NULL) ptr++; if (*ptr != '\0') *ptr++ = '\0'; } else *ptr++ = '\0'; } *arg = ptr; for (i = 0; *options != NULL; options++, i++) if (strcmp(*optp, *options) == 0) return (i); return (-1); } int main(int argc, char *argv[]) { int opt; FILE *fp; int background = 1; struct tport *p; const char *prefix = "snmpd"; struct lmodule *m; char *value = NULL, *option; /* XXX */ struct transport *t; #define DBG_DUMP 0 #define DBG_EVENTS 1 #define DBG_TRACE 2 static const char *const debug_opts[] = { "dump", "events", "trace", NULL }; snmp_printf = snmp_printf_func; snmp_error = snmp_error_func; snmp_debug = snmp_debug_func; asn_error = asn_error_func; while ((opt = getopt(argc, argv, "c:dD:e:hI:l:m:p:")) != EOF) switch (opt) { case 'c': strlcpy(config_file, optarg, sizeof(config_file)); break; case 'd': background = 0; break; case 'D': while (*optarg) { switch (getsubopt1(&optarg, debug_opts, &value, &option)) { case DBG_DUMP: debug.dump_pdus = 1; break; case DBG_EVENTS: debug.evdebug++; break; case DBG_TRACE: if (value == NULL) syslog(LOG_ERR, "no value for 'trace'"); else snmp_trace = strtoul(value, NULL, 0); break; case -1: if (suboptarg) syslog(LOG_ERR, "unknown debug flag '%s'", option); else syslog(LOG_ERR, "missing debug flag"); break; } } break; case 'e': strlcpy(engine_file, optarg, sizeof(engine_file)); break; case 'h': fprintf(stderr, "%s", 
usgtxt); exit(0); case 'I': syspath = optarg; break; case 'l': prefix = optarg; break; case 'm': do_macro(optarg); break; case 'p': strlcpy(pid_file, optarg, sizeof(pid_file)); break; } openlog(prefix, LOG_PID | (background ? 0 : LOG_PERROR), LOG_USER); setlogmask(LOG_UPTO(debug.logpri - 1)); if (background && daemon(0, 0) < 0) { syslog(LOG_ERR, "daemon: %m"); exit(1); } argc -= optind; argv += optind; progargs = argv; nprogargs = argc; srandomdev(); snmp_serial_no = random(); #ifdef USE_TCPWRAPPERS /* * Initialize hosts_access(3) handler. */ request_init(&req, RQ_DAEMON, "snmpd", 0); sock_methods(&req); #endif /* * Initialize the tree. */ if ((tree = malloc(sizeof(struct snmp_node) * CTREE_SIZE)) == NULL) { syslog(LOG_ERR, "%m"); exit(1); } memcpy(tree, ctree, sizeof(struct snmp_node) * CTREE_SIZE); tree_size = CTREE_SIZE; /* * Get standard communities */ (void)comm_define(1, "SNMP read", NULL, NULL); (void)comm_define(2, "SNMP write", NULL, NULL); community = COMM_INITIALIZE; trap_reqid = reqid_allocate(512, NULL); if (config_file[0] == '\0') snprintf(config_file, sizeof(config_file), PATH_CONFIG, prefix); init_actvals(); init_snmpd_engine(); this_tick = get_ticks(); start_tick = this_tick; /* start transports */ if (atexit(trans_stop) == -1) { syslog(LOG_ERR, "atexit failed: %m"); exit(1); } if (udp_trans.start() != SNMP_ERR_NOERROR) syslog(LOG_WARNING, "cannot start UDP transport"); if (lsock_trans.start() != SNMP_ERR_NOERROR) syslog(LOG_WARNING, "cannot start LSOCK transport"); #ifdef USE_LIBBEGEMOT if (debug.evdebug > 0) rpoll_trace = 1; #else if (evCreate(&evctx)) { syslog(LOG_ERR, "evCreate: %m"); exit(1); } if (debug.evdebug > 0) evSetDebug(evctx, 10, stderr); #endif if (engine_file[0] == '\0') snprintf(engine_file, sizeof(engine_file), PATH_ENGINE, prefix); if (read_config(config_file, NULL)) { syslog(LOG_ERR, "error in config file"); exit(1); } TAILQ_FOREACH(t, &transport_list, link) TAILQ_FOREACH(p, &t->table, link) t->vtab->init_port(p); init_sigs(); if (pid_file[0] == '\0') snprintf(pid_file, sizeof(pid_file), PATH_PID, prefix); if ((fp = fopen(pid_file, "w")) != NULL) { fprintf(fp, "%u", getpid()); fclose(fp); if (atexit(term) == -1) { syslog(LOG_ERR, "atexit failed: %m"); (void)remove(pid_file); exit(0); } } if (or_register(&oid_snmpMIB, "The MIB module for SNMPv2 entities.", NULL) == 0) { syslog(LOG_ERR, "cannot register SNMPv2 MIB"); exit(1); } if (or_register(&oid_begemotSnmpd, "The MIB module for the Begemot SNMPd.", NULL) == 0) { syslog(LOG_ERR, "cannot register begemotSnmpd MIB"); exit(1); } while ((m = TAILQ_FIRST(&modules_start)) != NULL) { m->flags &= ~LM_ONSTARTLIST; TAILQ_REMOVE(&modules_start, m, start); lm_start(m); } snmp_send_trap(&oid_coldStart, (struct snmp_value *)NULL); for (;;) { #ifndef USE_LIBBEGEMOT evEvent event; #endif struct lmodule *mod; TAILQ_FOREACH(mod, &lmodules, link) if (mod->config->idle != NULL) (*mod->config->idle)(); #ifndef USE_LIBBEGEMOT if (evGetNext(evctx, &event, EV_WAIT) == 0) { if (evDispatch(evctx, event)) syslog(LOG_ERR, "evDispatch: %m"); } else if (errno != EINTR) { syslog(LOG_ERR, "evGetNext: %m"); exit(1); } #else poll_dispatch(1); #endif if (work != 0) { block_sigs(); if (work & WORK_DOINFO) { #ifdef USE_LIBBEGEMOT info_func(); #else if (evWaitFor(evctx, &work, info_func, NULL, NULL) == -1) { syslog(LOG_ERR, "evWaitFor: %m"); exit(1); } #endif } if (work & WORK_RECONFIG) { #ifdef USE_LIBBEGEMOT config_func(); #else if (evWaitFor(evctx, &work, config_func, NULL, NULL) == -1) { syslog(LOG_ERR, "evWaitFor: %m"); exit(1); } 
#endif } work = 0; unblock_sigs(); #ifndef USE_LIBBEGEMOT if (evDo(evctx, &work) == -1) { syslog(LOG_ERR, "evDo: %m"); exit(1); } #endif } } return (0); } uint64_t get_ticks(void) { struct timeval tv; uint64_t ret; if (gettimeofday(&tv, NULL)) abort(); ret = tv.tv_sec * 100ULL + tv.tv_usec / 10000ULL; return (ret); } /* * Timer support */ /* * Trampoline for the non-repeatable timers. */ #ifdef USE_LIBBEGEMOT static void tfunc(int tid __unused, void *uap) #else static void tfunc(evContext ctx __unused, void *uap, struct timespec due __unused, struct timespec inter __unused) #endif { struct timer *tp = uap; LIST_REMOVE(tp, link); tp->func(tp->udata); free(tp); } /* * Trampoline for the repeatable timers. */ #ifdef USE_LIBBEGEMOT static void trfunc(int tid __unused, void *uap) #else static void trfunc(evContext ctx __unused, void *uap, struct timespec due __unused, struct timespec inter __unused) #endif { struct timer *tp = uap; tp->func(tp->udata); } /* * Start a one-shot timer */ void * timer_start(u_int ticks, void (*func)(void *), void *udata, struct lmodule *mod) { struct timer *tp; #ifndef USE_LIBBEGEMOT struct timespec due; #endif if ((tp = malloc(sizeof(struct timer))) == NULL) { syslog(LOG_CRIT, "out of memory for timer"); exit(1); } #ifndef USE_LIBBEGEMOT due = evAddTime(evNowTime(), evConsTime(ticks / 100, (ticks % 100) * 10000)); #endif tp->udata = udata; tp->owner = mod; tp->func = func; LIST_INSERT_HEAD(&timer_list, tp, link); #ifdef USE_LIBBEGEMOT if ((tp->id = poll_start_timer(ticks * 10, 0, tfunc, tp)) < 0) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #else if (evSetTimer(evctx, tfunc, tp, due, evConsTime(0, 0), &tp->id) == -1) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #endif return (tp); } /* * Start a repeatable timer. When used with USE_LIBBEGEMOT the first argument * is currently ignored and the initial number of ticks is set to the * repeat number of ticks. */ void * timer_start_repeat(u_int ticks __unused, u_int repeat_ticks, void (*func)(void *), void *udata, struct lmodule *mod) { struct timer *tp; #ifndef USE_LIBBEGEMOT struct timespec due; struct timespec inter; #endif if ((tp = malloc(sizeof(struct timer))) == NULL) { syslog(LOG_CRIT, "out of memory for timer"); exit(1); } #ifndef USE_LIBBEGEMOT due = evAddTime(evNowTime(), evConsTime(ticks / 100, (ticks % 100) * 10000)); inter = evConsTime(repeat_ticks / 100, (repeat_ticks % 100) * 10000); #endif tp->udata = udata; tp->owner = mod; tp->func = func; LIST_INSERT_HEAD(&timer_list, tp, link); #ifdef USE_LIBBEGEMOT if ((tp->id = poll_start_timer(repeat_ticks * 10, 1, trfunc, tp)) < 0) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #else if (evSetTimer(evctx, trfunc, tp, due, inter, &tp->id) == -1) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #endif return (tp); } /* * Stop a timer. */ void timer_stop(void *p) { struct timer *tp = p; LIST_REMOVE(tp, link); #ifdef USE_LIBBEGEMOT poll_stop_timer(tp->id); #else if (evClearTimer(evctx, tp->id) == -1) { syslog(LOG_ERR, "cannot stop timer: %m"); exit(1); } #endif free(p); } static void timer_flush(struct lmodule *mod) { struct timer *t, *t1; t = LIST_FIRST(&timer_list); while (t != NULL) { t1 = LIST_NEXT(t, link); if (t->owner == mod) timer_stop(t); t = t1; } } static void snmp_printf_func(const char *fmt, ...) 
{ va_list ap; static char *pend = NULL; char *ret, *new; va_start(ap, fmt); vasprintf(&ret, fmt, ap); va_end(ap); if (ret == NULL) return; if (pend != NULL) { if ((new = realloc(pend, strlen(pend) + strlen(ret) + 1)) == NULL) { free(ret); return; } pend = new; strcat(pend, ret); free(ret); } else pend = ret; while ((ret = strchr(pend, '\n')) != NULL) { *ret = '\0'; syslog(LOG_DEBUG, "%s", pend); if (strlen(ret + 1) == 0) { free(pend); pend = NULL; break; } strcpy(pend, ret + 1); } } static void snmp_error_func(const char *err, ...) { char errbuf[1000]; va_list ap; if (!(snmp_trace & LOG_SNMP_ERRORS)) return; va_start(ap, err); snprintf(errbuf, sizeof(errbuf), "SNMP: "); vsnprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), err, ap); va_end(ap); syslog(LOG_ERR, "%s", errbuf); } static void snmp_debug_func(const char *err, ...) { char errbuf[1000]; va_list ap; va_start(ap, err); snprintf(errbuf, sizeof(errbuf), "SNMP: "); vsnprintf(errbuf+strlen(errbuf), sizeof(errbuf)-strlen(errbuf), err, ap); va_end(ap); syslog(LOG_DEBUG, "%s", errbuf); } static void asn_error_func(const struct asn_buf *b, const char *err, ...) { char errbuf[1000]; va_list ap; u_int i; if (!(snmp_trace & LOG_ASN1_ERRORS)) return; va_start(ap, err); snprintf(errbuf, sizeof(errbuf), "ASN.1: "); vsnprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), err, ap); va_end(ap); if (b != NULL) { snprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), " at"); for (i = 0; b->asn_len > i; i++) snprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), " %02x", b->asn_cptr[i]); } syslog(LOG_ERR, "%s", errbuf); } /* * Create a new community */ u_int comm_define(u_int priv, const char *descr, struct lmodule *owner, const char *str) { struct community *c, *p; u_int ncomm; /* generate an identifier */ do { if ((ncomm = next_community_index++) == UINT_MAX) next_community_index = 1; TAILQ_FOREACH(c, &community_list, link) if (c->value == ncomm) break; } while (c != NULL); if ((c = malloc(sizeof(struct community))) == NULL) { syslog(LOG_ERR, "comm_define: %m"); return (0); } c->owner = owner; c->value = ncomm; c->descr = descr; c->string = NULL; c->private = priv; if (str != NULL) { if((c->string = malloc(strlen(str)+1)) == NULL) { free(c); return (0); } strcpy(c->string, str); } /* make index */ if (c->owner == NULL) { c->index.len = 1; c->index.subs[0] = 0; } else { c->index = c->owner->index; } c->index.subs[c->index.len++] = c->private; /* * Insert ordered */ TAILQ_FOREACH(p, &community_list, link) { if (asn_compare_oid(&p->index, &c->index) > 0) { TAILQ_INSERT_BEFORE(p, c, link); break; } } if (p == NULL) TAILQ_INSERT_TAIL(&community_list, c, link); return (c->value); } const char * comm_string(u_int ncomm) { struct community *p; TAILQ_FOREACH(p, &community_list, link) if (p->value == ncomm) return (p->string); return (NULL); } /* * Delete all communities allocated by a module */ static void comm_flush(struct lmodule *mod) { struct community *p, *p1; p = TAILQ_FIRST(&community_list); while (p != NULL) { p1 = TAILQ_NEXT(p, link); if (p->owner == mod) { free(p->string); TAILQ_REMOVE(&community_list, p, link); free(p); } p = p1; } } /* * Request ID handling. * * Allocate a new range of request ids. Use a first fit algorithm. 
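 *
 * Typical module usage (a sketch; MYMOD_NREQIDS is a hypothetical
 * constant): allocate a range once at initialization, draw request
 * ids from it for outgoing PDUs, and recognize matching responses by
 * mapping the id back to the range:
 *
 *	u_int type = reqid_allocate(MYMOD_NREQIDS, mod);
 *	...
 *	pdu->request_id = reqid_next(type);
 *	...
 *	if (reqid_istype(resp->request_id, type))
 *		the response belongs to this module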
 */
u_int
reqid_allocate(int size, struct lmodule *mod)
{
	u_int type;
	struct idrange *r, *r1;

	if (size <= 0 || size > INT32_MAX) {
		syslog(LOG_CRIT, "%s: size out of range: %d", __func__, size);
		return (0);
	}
	/* allocate a type id */
	do {
		if ((type = next_idrange++) == UINT_MAX)
			next_idrange = 1;
		TAILQ_FOREACH(r, &idrange_list, link)
			if (r->type == type)
				break;
	} while (r != NULL);

	/* find a range */
	if (TAILQ_EMPTY(&idrange_list))
		r = NULL;
	else {
		r = TAILQ_FIRST(&idrange_list);
		if (r->base < size) {
			while ((r1 = TAILQ_NEXT(r, link)) != NULL) {
				if (r1->base - (r->base + r->size) >= size)
					break;
				r = r1;
			}
			r = r1;
		}
		if (r == NULL) {
			r1 = TAILQ_LAST(&idrange_list, idrange_list);
			if (INT32_MAX - size + 1 < r1->base + r1->size) {
				syslog(LOG_ERR, "out of id ranges (%u)",
				    size);
				return (0);
			}
		}
	}

	/* allocate structure */
	if ((r1 = malloc(sizeof(struct idrange))) == NULL) {
		syslog(LOG_ERR, "%s: %m", __FUNCTION__);
		return (0);
	}

	r1->type = type;
	r1->size = size;
	r1->owner = mod;
	if (TAILQ_EMPTY(&idrange_list) || r == TAILQ_FIRST(&idrange_list)) {
		r1->base = 0;
		TAILQ_INSERT_HEAD(&idrange_list, r1, link);
	} else if (r == NULL) {
		r = TAILQ_LAST(&idrange_list, idrange_list);
		r1->base = r->base + r->size;
		TAILQ_INSERT_TAIL(&idrange_list, r1, link);
	} else {
		r = TAILQ_PREV(r, idrange_list, link);
		r1->base = r->base + r->size;
		TAILQ_INSERT_AFTER(&idrange_list, r, r1, link);
	}
	r1->next = r1->base;

	return (type);
}

int32_t
reqid_next(u_int type)
{
	struct idrange *r;
	int32_t id;

	TAILQ_FOREACH(r, &idrange_list, link)
		if (r->type == type)
			break;
	if (r == NULL) {
		syslog(LOG_CRIT, "wrong idrange type");
		abort();
	}
	if ((id = r->next++) == r->base + (r->size - 1))
		r->next = r->base;
	return (id);
}

int32_t
reqid_base(u_int type)
{
	struct idrange *r;

	TAILQ_FOREACH(r, &idrange_list, link)
		if (r->type == type)
			return (r->base);
	syslog(LOG_CRIT, "wrong idrange type");
	abort();
}

u_int
reqid_type(int32_t reqid)
{
	struct idrange *r;

	TAILQ_FOREACH(r, &idrange_list, link)
		if (reqid >= r->base && reqid <= r->base + (r->size - 1))
			return (r->type);
	return (0);
}

int
reqid_istype(int32_t reqid, u_int type)
{
	return (reqid_type(reqid) == type);
}

/*
 * Delete all request id ranges allocated by a module.
 */
static void
reqid_flush(struct lmodule *mod)
{
	struct idrange *p, *p1;

	p = TAILQ_FIRST(&idrange_list);
	while (p != NULL) {
		p1 = TAILQ_NEXT(p, link);
		if (p->owner == mod) {
			TAILQ_REMOVE(&idrange_list, p, link);
			free(p);
		}
		p = p1;
	}
}

/*
 * Merge the given tree for the given module into the main tree.
 */
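/*
 * The combined tree is re-sorted by OID with compare_node() below after
 * every merge, so node lookups can rely on OID order.  lm_start() performs
 * the merge for a freshly loaded module:
 *
 *	tree_merge(mod->config->tree, mod->config->tree_size, mod);
 */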
*/ static int compare_node(const void *v1, const void *v2) { const struct snmp_node *n1 = v1; const struct snmp_node *n2 = v2; return (asn_compare_oid(&n1->oid, &n2->oid)); } static int tree_merge(const struct snmp_node *ntree, u_int nsize, struct lmodule *mod) { struct snmp_node *xtree; u_int i; xtree = realloc(tree, sizeof(*tree) * (tree_size + nsize)); if (xtree == NULL) { syslog(LOG_ERR, "tree_merge: %m"); return (-1); } tree = xtree; memcpy(&tree[tree_size], ntree, sizeof(*tree) * nsize); for (i = 0; i < nsize; i++) tree[tree_size + i].tree_data = mod; tree_size += nsize; qsort(tree, tree_size, sizeof(tree[0]), compare_node); return (0); } /* * Remove all nodes belonging to the loadable module */ static void tree_unmerge(struct lmodule *mod) { u_int s, d; for(s = d = 0; s < tree_size; s++) if (tree[s].tree_data != mod) { if (s != d) tree[d] = tree[s]; d++; } tree_size = d; } /* * Loadable modules */ struct lmodule * lm_load(const char *path, const char *section) { struct lmodule *m; int err; int i; char *av[MAX_MOD_ARGS + 1]; int ac; u_int u; if ((m = malloc(sizeof(*m))) == NULL) { syslog(LOG_ERR, "lm_load: %m"); return (NULL); } m->handle = NULL; m->flags = 0; strcpy(m->section, section); if ((m->path = malloc(strlen(path) + 1)) == NULL) { syslog(LOG_ERR, "lm_load: %m"); goto err; } strcpy(m->path, path); /* * Make index */ m->index.subs[0] = strlen(section); m->index.len = m->index.subs[0] + 1; for (u = 0; u < m->index.subs[0]; u++) m->index.subs[u + 1] = section[u]; /* * Load the object file and locate the config structure */ if ((m->handle = dlopen(m->path, RTLD_NOW|RTLD_GLOBAL)) == NULL) { syslog(LOG_ERR, "lm_load: open %s", dlerror()); goto err; } if ((m->config = dlsym(m->handle, "config")) == NULL) { syslog(LOG_ERR, "lm_load: no 'config' symbol %s", dlerror()); goto err; } /* * Insert it into the right place */ INSERT_OBJECT_OID(m, &lmodules); /* preserve order */ if (community == COMM_INITIALIZE) { m->flags |= LM_ONSTARTLIST; TAILQ_INSERT_TAIL(&modules_start, m, start); } /* * make the argument vector. */ ac = 0; for (i = 0; i < nprogargs; i++) { if (strlen(progargs[i]) >= strlen(section) + 1 && strncmp(progargs[i], section, strlen(section)) == 0 && progargs[i][strlen(section)] == ':') { if (ac == MAX_MOD_ARGS) { syslog(LOG_WARNING, "too many arguments for " "module '%s", section); break; } av[ac++] = &progargs[i][strlen(section)+1]; } } av[ac] = NULL; /* * Run the initialization function */ if ((err = (*m->config->init)(m, ac, av)) != 0) { syslog(LOG_ERR, "lm_load: init failed: %d", err); TAILQ_REMOVE(&lmodules, m, link); goto err; } return (m); err: if ((m->flags & LM_ONSTARTLIST) != 0) TAILQ_REMOVE(&modules_start, m, start); if (m->handle) dlclose(m->handle); free(m->path); free(m); return (NULL); } /* * Start a module */ void lm_start(struct lmodule *mod) { const struct lmodule *m; /* * Merge tree. If this fails, unload the module. */ if (tree_merge(mod->config->tree, mod->config->tree_size, mod)) { lm_unload(mod); return; } /* * Read configuration */ if (read_config(config_file, mod)) { syslog(LOG_ERR, "error in config file"); lm_unload(mod); return; } if (mod->config->start) (*mod->config->start)(); mod->flags |= LM_STARTED; /* * Inform other modules */ TAILQ_FOREACH(m, &lmodules, link) if (m->config->loading) (*m->config->loading)(mod, 1); } /* * Unload a module. 
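 *
 * Unloading mirrors lm_load()/lm_start(): the module's subtree is
 * unmerged, its fini routine runs if the module was started, and any
 * resources still registered on its behalf (communities, request-id
 * ranges, timers, selected file descriptors) are released before the
 * object file is dlclose()d and the remaining modules are notified.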
*/ void lm_unload(struct lmodule *m) { int err; const struct lmodule *mod; TAILQ_REMOVE(&lmodules, m, link); if (m->flags & LM_ONSTARTLIST) TAILQ_REMOVE(&modules_start, m, start); tree_unmerge(m); if ((m->flags & LM_STARTED) && m->config->fini && (err = (*m->config->fini)()) != 0) syslog(LOG_WARNING, "lm_unload(%s): fini %d", m->section, err); comm_flush(m); reqid_flush(m); timer_flush(m); fd_flush(m); dlclose(m->handle); free(m->path); /* * Inform other modules */ TAILQ_FOREACH(mod, &lmodules, link) if (mod->config->loading) (*mod->config->loading)(m, 0); free(m); } /* * Register an object resource and return the index (or 0 on failures) */ u_int or_register(const struct asn_oid *or, const char *descr, struct lmodule *mod) { struct objres *objres, *or1; u_int idx; /* find a free index */ idx = 1; for (objres = TAILQ_FIRST(&objres_list); objres != NULL; objres = TAILQ_NEXT(objres, link)) { if ((or1 = TAILQ_NEXT(objres, link)) == NULL || or1->index > objres->index + 1) { idx = objres->index + 1; break; } } if ((objres = malloc(sizeof(*objres))) == NULL) return (0); objres->index = idx; objres->oid = *or; strlcpy(objres->descr, descr, sizeof(objres->descr)); objres->uptime = (uint32_t)(get_ticks() - start_tick); objres->module = mod; INSERT_OBJECT_INT(objres, &objres_list); systemg.or_last_change = objres->uptime; return (idx); } void or_unregister(u_int idx) { struct objres *objres; TAILQ_FOREACH(objres, &objres_list, link) if (objres->index == idx) { TAILQ_REMOVE(&objres_list, objres, link); free(objres); return; } } /* * RFC 3414 User-based Security Model support */ struct snmpd_usmstat * bsnmpd_get_usm_stats(void) { return (&snmpd_usmstats); } void bsnmpd_reset_usm_stats(void) { memset(&snmpd_usmstats, 0, sizeof(snmpd_usmstats)); } struct usm_user * usm_first_user(void) { return (SLIST_FIRST(&usm_userlist)); } struct usm_user * usm_next_user(struct usm_user *uuser) { if (uuser == NULL) return (NULL); return (SLIST_NEXT(uuser, up)); } struct usm_user * usm_find_user(uint8_t *engine, uint32_t elen, char *uname) { struct usm_user *uuser; SLIST_FOREACH(uuser, &usm_userlist, up) if (uuser->user_engine_len == elen && memcmp(uuser->user_engine_id, engine, elen) == 0 && strlen(uuser->suser.sec_name) == strlen(uname) && strcmp(uuser->suser.sec_name, uname) == 0) break; return (uuser); } static int usm_compare_user(struct usm_user *u1, struct usm_user *u2) { uint32_t i; if (u1->user_engine_len < u2->user_engine_len) return (-1); if (u1->user_engine_len > u2->user_engine_len) return (1); for (i = 0; i < u1->user_engine_len; i++) { if (u1->user_engine_id[i] < u2->user_engine_id[i]) return (-1); if (u1->user_engine_id[i] > u2->user_engine_id[i]) return (1); } if (strlen(u1->suser.sec_name) < strlen(u2->suser.sec_name)) return (-1); if (strlen(u1->suser.sec_name) > strlen(u2->suser.sec_name)) return (1); for (i = 0; i < strlen(u1->suser.sec_name); i++) { if (u1->suser.sec_name[i] < u2->suser.sec_name[i]) return (-1); if (u1->suser.sec_name[i] > u2->suser.sec_name[i]) return (1); } return (0); } struct usm_user * usm_new_user(uint8_t *eid, uint32_t elen, char *uname) { int cmp; struct usm_user *uuser, *temp, *prev; for (uuser = usm_first_user(); uuser != NULL; (uuser = usm_next_user(uuser))) { if (uuser->user_engine_len == elen && strlen(uname) == strlen(uuser->suser.sec_name) && strcmp(uname, uuser->suser.sec_name) == 0 && memcmp(eid, uuser->user_engine_id, elen) == 0) return (NULL); } if ((uuser = (struct usm_user *)malloc(sizeof(*uuser))) == NULL) return (NULL); - memset(uuser, 0, sizeof(struct 
usm_user)); + memset(uuser, 0, sizeof(*uuser)); strlcpy(uuser->suser.sec_name, uname, SNMP_ADM_STR32_SIZ); memcpy(uuser->user_engine_id, eid, elen); uuser->user_engine_len = elen; if ((prev = SLIST_FIRST(&usm_userlist)) == NULL || usm_compare_user(uuser, prev) < 0) { SLIST_INSERT_HEAD(&usm_userlist, uuser, up); return (uuser); } SLIST_FOREACH(temp, &usm_userlist, up) { if ((cmp = usm_compare_user(uuser, temp)) <= 0) break; prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, uuser, up); else if (cmp > 0) SLIST_INSERT_AFTER(temp, uuser, up); else { syslog(LOG_ERR, "User %s exists", uuser->suser.sec_name); free(uuser); return (NULL); } return (uuser); } void usm_delete_user(struct usm_user *uuser) { SLIST_REMOVE(&usm_userlist, uuser, usm_user, up); free(uuser); } void usm_flush_users(void) { struct usm_user *uuser; while ((uuser = SLIST_FIRST(&usm_userlist)) != NULL) { SLIST_REMOVE_HEAD(&usm_userlist, up); free(uuser); } SLIST_INIT(&usm_userlist); } /* * RFC 3415 View-based Access Control Model support */ struct vacm_user * vacm_first_user(void) { return (SLIST_FIRST(&vacm_userlist)); } struct vacm_user * vacm_next_user(struct vacm_user *vuser) { if (vuser == NULL) return (NULL); return (SLIST_NEXT(vuser, vvu)); } static int vacm_compare_user(struct vacm_user *v1, struct vacm_user *v2) { uint32_t i; if (v1->sec_model < v2->sec_model) return (-1); if (v1->sec_model > v2->sec_model) return (1); if (strlen(v1->secname) < strlen(v2->secname)) return (-1); if (strlen(v1->secname) > strlen(v2->secname)) return (1); for (i = 0; i < strlen(v1->secname); i++) { if (v1->secname[i] < v2->secname[i]) return (-1); if (v1->secname[i] > v2->secname[i]) return (1); } return (0); } struct vacm_user * vacm_new_user(int32_t smodel, char *uname) { int cmp; struct vacm_user *user, *temp, *prev; SLIST_FOREACH(user, &vacm_userlist, vvu) if (strcmp(uname, user->secname) == 0 && smodel == user->sec_model) return (NULL); if ((user = (struct vacm_user *)malloc(sizeof(*user))) == NULL) return (NULL); memset(user, 0, sizeof(*user)); user->group = &vacm_default_group; SLIST_INSERT_HEAD(&vacm_default_group.group_users, user, vvg); user->sec_model = smodel; strlcpy(user->secname, uname, sizeof(user->secname)); if ((prev = SLIST_FIRST(&vacm_userlist)) == NULL || vacm_compare_user(user, prev) < 0) { SLIST_INSERT_HEAD(&vacm_userlist, user, vvu); return (user); } SLIST_FOREACH(temp, &vacm_userlist, vvu) { if ((cmp = vacm_compare_user(user, temp)) <= 0) break; prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, user, vvu); else if (cmp > 0) SLIST_INSERT_AFTER(temp, user, vvu); else { syslog(LOG_ERR, "User %s exists", user->secname); free(user); return (NULL); } return (user); } int vacm_delete_user(struct vacm_user *user) { if (user->group != NULL && user->group != &vacm_default_group) { SLIST_REMOVE(&user->group->group_users, user, vacm_user, vvg); if (SLIST_EMPTY(&user->group->group_users)) { SLIST_REMOVE(&vacm_grouplist, user->group, vacm_group, vge); free(user->group); } } SLIST_REMOVE(&vacm_userlist, user, vacm_user, vvu); free(user); return (0); } int vacm_user_set_group(struct vacm_user *user, u_char *octets, u_int len) { struct vacm_group *group; if (len >= SNMP_ADM_STR32_SIZ) return (-1); SLIST_FOREACH(group, &vacm_grouplist, vge) if (strlen(group->groupname) == len && memcmp(octets, group->groupname, len) == 0) break; if (group == NULL) { if ((group = (struct vacm_group *)malloc(sizeof(*group))) == NULL) return (-1); memset(group, 0, sizeof(*group)); memcpy(group->groupname, octets, 
len); group->groupname[len] = '\0'; SLIST_INSERT_HEAD(&vacm_grouplist, group, vge); } SLIST_REMOVE(&user->group->group_users, user, vacm_user, vvg); SLIST_INSERT_HEAD(&group->group_users, user, vvg); user->group = group; return (0); } void vacm_groups_init(void) { SLIST_INSERT_HEAD(&vacm_grouplist, &vacm_default_group, vge); } struct vacm_access * vacm_first_access_rule(void) { return (TAILQ_FIRST(&vacm_accesslist)); } struct vacm_access * vacm_next_access_rule(struct vacm_access *acl) { if (acl == NULL) return (NULL); return (TAILQ_NEXT(acl, vva)); } static int vacm_compare_access_rule(struct vacm_access *v1, struct vacm_access *v2) { uint32_t i; if (strlen(v1->group->groupname) < strlen(v2->group->groupname)) return (-1); if (strlen(v1->group->groupname) > strlen(v2->group->groupname)) return (1); for (i = 0; i < strlen(v1->group->groupname); i++) { if (v1->group->groupname[i] < v2->group->groupname[i]) return (-1); if (v1->group->groupname[i] > v2->group->groupname[i]) return (1); } if (strlen(v1->ctx_prefix) < strlen(v2->ctx_prefix)) return (-1); if (strlen(v1->ctx_prefix) > strlen(v2->ctx_prefix)) return (1); for (i = 0; i < strlen(v1->ctx_prefix); i++) { if (v1->ctx_prefix[i] < v2->ctx_prefix[i]) return (-1); if (v1->ctx_prefix[i] > v2->ctx_prefix[i]) return (1); } if (v1->sec_model < v2->sec_model) return (-1); if (v1->sec_model > v2->sec_model) return (1); if (v1->sec_level < v2->sec_level) return (-1); if (v1->sec_level > v2->sec_level) return (1); return (0); } struct vacm_access * vacm_new_access_rule(char *gname, char *cprefix, int32_t smodel, int32_t slevel) { struct vacm_group *group; struct vacm_access *acl, *temp; TAILQ_FOREACH(acl, &vacm_accesslist, vva) { if (acl->group == NULL) continue; if (strcmp(gname, acl->group->groupname) == 0 && strcmp(cprefix, acl->ctx_prefix) == 0 && acl->sec_model == smodel && acl->sec_level == slevel) return (NULL); } /* Make sure the group exists */ SLIST_FOREACH(group, &vacm_grouplist, vge) if (strcmp(gname, group->groupname) == 0) break; if (group == NULL) return (NULL); if ((acl = (struct vacm_access *)malloc(sizeof(*acl))) == NULL) return (NULL); memset(acl, 0, sizeof(*acl)); acl->group = group; strlcpy(acl->ctx_prefix, cprefix, sizeof(acl->ctx_prefix)); acl->sec_model = smodel; acl->sec_level = slevel; if ((temp = TAILQ_FIRST(&vacm_accesslist)) == NULL || vacm_compare_access_rule(acl, temp) < 0) { TAILQ_INSERT_HEAD(&vacm_accesslist, acl, vva); return (acl); } TAILQ_FOREACH(temp, &vacm_accesslist, vva) if (vacm_compare_access_rule(acl, temp) < 0) { TAILQ_INSERT_BEFORE(temp, acl, vva); return (acl); } TAILQ_INSERT_TAIL(&vacm_accesslist, acl, vva); return (acl); } int vacm_delete_access_rule(struct vacm_access *acl) { TAILQ_REMOVE(&vacm_accesslist, acl, vva); free(acl); return (0); } struct vacm_view * vacm_first_view(void) { return (SLIST_FIRST(&vacm_viewlist)); } struct vacm_view * vacm_next_view(struct vacm_view *view) { if (view == NULL) return (NULL); return (SLIST_NEXT(view, vvl)); } static int vacm_compare_view(struct vacm_view *v1, struct vacm_view *v2) { uint32_t i; if (strlen(v1->viewname) < strlen(v2->viewname)) return (-1); if (strlen(v1->viewname) > strlen(v2->viewname)) return (1); for (i = 0; i < strlen(v1->viewname); i++) { if (v1->viewname[i] < v2->viewname[i]) return (-1); if (v1->viewname[i] > v2->viewname[i]) return (1); } return (asn_compare_oid(&v1->subtree, &v2->subtree)); } struct vacm_view * vacm_new_view(char *vname, struct asn_oid *oid) { int cmp; struct vacm_view *view, *temp, *prev; SLIST_FOREACH(view, 
&vacm_viewlist, vvl) if (strcmp(vname, view->viewname) == 0) return (NULL); if ((view = (struct vacm_view *)malloc(sizeof(*view))) == NULL) return (NULL); memset(view, 0, sizeof(*view)); strlcpy(view->viewname, vname, sizeof(view->viewname)); asn_append_oid(&view->subtree, oid); if ((prev = SLIST_FIRST(&vacm_viewlist)) == NULL || vacm_compare_view(view, prev) < 0) { SLIST_INSERT_HEAD(&vacm_viewlist, view, vvl); return (view); } SLIST_FOREACH(temp, &vacm_viewlist, vvl) { if ((cmp = vacm_compare_view(view, temp)) <= 0) break; prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, view, vvl); else if (cmp > 0) SLIST_INSERT_AFTER(temp, view, vvl); else { syslog(LOG_ERR, "View %s exists", view->viewname); free(view); return (NULL); } return (view); } int vacm_delete_view(struct vacm_view *view) { SLIST_REMOVE(&vacm_viewlist, view, vacm_view, vvl); free(view); return (0); } struct vacm_context * vacm_first_context(void) { return (SLIST_FIRST(&vacm_contextlist)); } struct vacm_context * vacm_next_context(struct vacm_context *vacmctx) { if (vacmctx == NULL) return (NULL); return (SLIST_NEXT(vacmctx, vcl)); } struct vacm_context * vacm_add_context(char *ctxname, int regid) { int cmp; struct vacm_context *ctx, *temp, *prev; SLIST_FOREACH(ctx, &vacm_contextlist, vcl) if (strcmp(ctxname, ctx->ctxname) == 0) { syslog(LOG_ERR, "Context %s exists", ctx->ctxname); return (NULL); } if ((ctx = (struct vacm_context *)malloc(sizeof(*ctx))) == NULL) return (NULL); memset(ctx, 0, sizeof(*ctx)); strlcpy(ctx->ctxname, ctxname, sizeof(ctx->ctxname)); ctx->regid = regid; if ((prev = SLIST_FIRST(&vacm_contextlist)) == NULL || strlen(ctx->ctxname) < strlen(prev->ctxname) || strcmp(ctx->ctxname, prev->ctxname) < 0) { SLIST_INSERT_HEAD(&vacm_contextlist, ctx, vcl); return (ctx); } SLIST_FOREACH(temp, &vacm_contextlist, vcl) { if (strlen(ctx->ctxname) < strlen(temp->ctxname) || strcmp(ctx->ctxname, temp->ctxname) < 0) { cmp = -1; break; } prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, ctx, vcl); else if (cmp > 0) SLIST_INSERT_AFTER(temp, ctx, vcl); else { syslog(LOG_ERR, "Context %s exists", ctx->ctxname); free(ctx); return (NULL); } return (ctx); } void vacm_flush_contexts(int regid) { struct vacm_context *ctx, *temp; SLIST_FOREACH_SAFE(ctx, &vacm_contextlist, vcl, temp) if (ctx->regid == regid) { SLIST_REMOVE(&vacm_contextlist, ctx, vacm_context, vcl); free(ctx); } } Index: user/ngie/bsnmp_cleanup/sys/arm/arm/pmap-v6.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/arm/arm/pmap-v6.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/arm/arm/pmap-v6.c (revision 298468) @@ -1,6653 +1,6805 @@ /*- * Copyright (c) 1991 Regents of the University of California. * Copyright (c) 1994 John S. Dyson * Copyright (c) 1994 David Greenman * Copyright (c) 2005-2010 Alan L. Cox * Copyright (c) 2014-2016 Svatopluk Kraus * Copyright (c) 2014-2016 Michal Meloun * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. 
This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include "opt_pmap.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #else #include #endif #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #ifndef DIAGNOSTIC #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PMAP_DEBUG static void pmap_zero_page_check(vm_page_t m); void pmap_debug(int level); int pmap_pid_dump(int pid); #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 1; #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #endif /* PMAP_DEBUG */ /* * Level 2 page tables map definion ('max' is excluded). */ #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define UPT2V_MAX_ADDRESS \ ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) /* * Promotion to a 1MB (PTE1) page mapping requires that the corresponding * 4KB (PTE2) page mappings have identical settings for the following fields: */ #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ PTE2_ATTR_MASK) #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ PTE1_ATTR_MASK) #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ (((l2_attr) & L2_C) ? L1_S_C : 0) | \ (((l2_attr) & L2_B) ? L1_S_B : 0) | \ (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ (((l2_attr) & PTE2_W) ? PTE1_W : 0)) #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ (((l1_attr) & L1_S_C) ? L2_C : 0) | \ (((l1_attr) & L1_S_B) ? L2_B : 0) | \ (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ (((l1_attr) & PTE1_W) ? PTE2_W : 0)) /* * PTE2 descriptors creation macros. */ #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * The boot_pt1 is used temporary in very early boot stage as L1 page table. 
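
[Ed. note: the ATTR_TO_L1()/ATTR_TO_L2() macros above are pure bit-for-bit translations, so an attribute converted L2 -> L1 -> L2 must come back unchanged. A minimal user-space sketch of that invariant; every T_* name and bit value below is invented for illustration and is not the kernel's encoding.]

#include <assert.h>
#include <stdint.h>

#define T_L2_RO	0x01	/* toy encodings only */
#define T_L2_NX	0x02
#define T_L1_RO	0x10
#define T_L1_NX	0x20

#define T_TO_L1(a)	((((a) & T_L2_RO) ? T_L1_RO : 0) | \
			 (((a) & T_L2_NX) ? T_L1_NX : 0))
#define T_TO_L2(a)	((((a) & T_L1_RO) ? T_L2_RO : 0) | \
			 (((a) & T_L1_NX) ? T_L2_NX : 0))

int
main(void)
{
	uint32_t a;

	/* Every combination of the translated fields must round-trip. */
	for (a = 0; a < 4; a++)
		assert(T_TO_L2(T_TO_L1(a)) == a);
	return (0);
}
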
* We can init many things with no memory allocation thanks to its static * allocation and this brings two main advantages: * (1) other cores can be started very simply, * (2) various boot loaders can be supported as its arguments can be processed * in virtual address space and can be moved to safe location before * first allocation happened. * Only disadvantage is that boot_pt1 is used only in very early boot stage. * However, the table is uninitialized and so lays in bss. Therefore kernel * image size is not influenced. * * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and * CPU suspend/resume game. */ extern pt1_entry_t boot_pt1[]; vm_paddr_t base_pt1; pt1_entry_t *kern_pt1; pt2_entry_t *kern_pt2tab; pt2_entry_t *PT2MAP; static uint32_t ttb_flags; static vm_memattr_t pt_memattr; ttb_entry_t pmap_kern_ttb; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static vm_offset_t kernel_vm_end_new; vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; vm_offset_t vm_max_kernel_address; vm_paddr_t kernel_l1pa; static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ vm_paddr_t first_managed_pa; #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt2_entry_t *CMAP1; pt2_entry_t *CMAP2; pt2_entry_t *CMAP3; caddr_t CADDR1; caddr_t CADDR2; caddr_t CADDR3; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; static pt2_entry_t *CMAP3; static caddr_t CADDR3; caddr_t _tmppt = 0; struct msgbuf *msgbufp = NULL; /* XXX move it to machdep.c */ /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt2_entry_t *PMAP1 = NULL, *PMAP2; static pt2_entry_t *PADDR1 = NULL, *PADDR2; #ifdef DDB static pt2_entry_t *PMAP3; static pt2_entry_t *PADDR3; static int PMAP3cpu __unused; /* for SMP only */ #endif #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte2_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte2_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte2_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static __inline void pt2_wirecount_init(vm_page_t m); static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va); void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); /* * Function to set the debug level of the pmap code. 
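
[Ed. note: pa_to_pvh() above keeps one md_page per 1MB section, indexed from first_managed_pa. A toy model of the index arithmetic, with PTE1_SHIFT assumed to be 20 (1MB sections); all names are hypothetical.]

#include <assert.h>
#include <stdint.h>

#define T_PTE1_SHIFT	20	/* assumed: 1MB sections */

static unsigned
toy_pvh_index(uint64_t pa, uint64_t first_managed_pa)
{

	/* One pv head (md_page) per 1MB section of managed memory. */
	return ((unsigned)((pa - first_managed_pa) >> T_PTE1_SHIFT));
}

int
main(void)
{

	/* 0x20180000 lies in the second section above 0x20000000. */
	assert(toy_pvh_index(0x20180000, 0x20000000) == 1);
	return (0);
}
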
*/ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ /* * This table must correspond with the memory attribute configuration in vm.h. * The first entry is used for normal system mapping. * * Device memory is always marked as shared. * Normal memory is shared only in the SMP case. * Not-outer-shareable (NOS) bits are not used yet. * Class 6 cannot be used on ARM11. */ #define TEXDEF_TYPE_SHIFT 0 #define TEXDEF_TYPE_MASK 0x3 #define TEXDEF_INNER_SHIFT 2 #define TEXDEF_INNER_MASK 0x3 #define TEXDEF_OUTER_SHIFT 4 #define TEXDEF_OUTER_MASK 0x3 #define TEXDEF_NOS_SHIFT 6 #define TEXDEF_NOS_MASK 0x1 #define TEX(t, i, o, s) \ ((t) << TEXDEF_TYPE_SHIFT) | \ ((i) << TEXDEF_INNER_SHIFT) | \ ((o) << TEXDEF_OUTER_SHIFT | \ ((s) << TEXDEF_NOS_SHIFT)) static uint32_t tex_class[8] = { /* type inner cache outer cache */ TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ }; #undef TEX static uint32_t pte2_attr_tab[8] = { PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 0, /* 5 - NOT USED YET */ 0, /* 6 - NOT USED YET */ 0 /* 7 - NOT USED YET */ }; CTASSERT(VM_MEMATTR_WB_WA == 0); CTASSERT(VM_MEMATTR_NOCACHE == 1); CTASSERT(VM_MEMATTR_DEVICE == 2); CTASSERT(VM_MEMATTR_SO == 3); CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); static inline uint32_t vm_memattr_to_pte2(vm_memattr_t ma) { KASSERT((u_int)ma < 5, ("%s: bad vm_memattr_t %d", __func__, ma)); return (pte2_attr_tab[(u_int)ma]); } static inline uint32_t vm_page_pte2_attr(vm_page_t m) { return (vm_memattr_to_pte2(m->md.pat_mode)); } /* * Convert TEX definition entry to TTB flags. */ static uint32_t encode_ttb_flags(int idx) { uint32_t inner, outer, nos, reg; inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; reg = nos << 5; reg |= outer << 3; if (cpuinfo.coherent_walk) reg |= (inner & 0x1) << 6; reg |= (inner & 0x2) >> 1; #ifdef SMP reg |= 1 << 1; #endif return reg; } /* * Set TEX remapping registers on the current CPU. */ void pmap_set_tex(void) { uint32_t prrr, nmrr; uint32_t type, inner, outer, nos; int i; #ifdef PMAP_PTE_NOCACHE /* XXX fixme */ if (cpuinfo.coherent_walk) { pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); } else { pt_memattr = VM_MEMATTR_NOCACHE; ttb_flags = encode_ttb_flags(1); } #else pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); #endif prrr = 0; nmrr = 0; /* Build remapping registers from TEX classes.
*/ for (i = 0; i < 8; i++) { type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & TEXDEF_TYPE_MASK; inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; prrr |= type << (i * 2); prrr |= nos << (i + 24); nmrr |= inner << (i * 2); nmrr |= outer << (i * 2 + 16); } /* Add shareable bits for device memory. */ prrr |= PRRR_DS0 | PRRR_DS1; /* Add shareable bits for normal memory in SMP case. */ #ifdef SMP prrr |= PRRR_NS1; #endif cp15_prrr_set(prrr); cp15_nmrr_set(nmrr); /* Caches are disabled, so full TLB flush should be enough. */ tlb_flush_all_local(); } /* * KERNBASE must be a multiple of NPT2_IN_PG * PTE1_SIZE. In other words, * KERNBASE is mapped by the first L2 page table in an L2 page table page. It * meets the same constraint due to PT2MAP being placed just under KERNBASE. */ CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); /* * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. * For now, anyhow, the following check must be fulfilled. */ CTASSERT(PAGE_SIZE == PTE2_SIZE); /* * We don't want to mess up MI code with all MMU and PMAP definitions, * so some things, which depend on other ones, are defined independently. * Now, it is time to check that we don't screw up something. */ CTASSERT(PDRSHIFT == PTE1_SHIFT); /* * Check L1 and L2 page table entries definitions consistency. */ CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); /* * Check L2 page tables page consistency. */ CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); /* * Check PT2TAB consistency. * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. * The division must leave no remainder. */ CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); /* * A PT2MAP magic. * * All level 2 page tables (PT2s) are mapped continuously and accordingly * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page * must be used together, but not necessarily at once. The first PT2 in a page * must map things on a correctly aligned address and the others must follow * in the right order. */ #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) /* * Check PT2TAB consistency. * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. * Both divisions must leave no remainder. */ CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); /* * The implementation was made general, however, with the assumption * below in mind. In case of another value of NPG_IN_PT2TAB, * the code should be rechecked once more. */ CTASSERT(NPG_IN_PT2TAB == 1); /* * Get offset of PT2 in a page * associated with given PT1 index. */ static __inline u_int page_pt2off(u_int pt1_idx) { return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); } /* * Get physical address of PT2 * associated with given PT2s page and PT1 index. */ static __inline vm_paddr_t page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) { return (pgpa + page_pt2off(pt1_idx)); } /* * Get first entry of PT2 * associated with given PT2s page and PT1 index.
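
[Ed. note: a worked example of the page_pt2off()/page_pt2pa() arithmetic just defined, under the assumption of 4KB pages and 1KB PT2s (four PT2s per page, so PT2PG_MASK == 0x3); T_* names are stand-ins.]

#include <assert.h>
#include <stdint.h>

#define T_NB_IN_PT2	1024u	/* assumed PT2 size */
#define T_PT2PG_MASK	0x3u	/* four PT2s per 4KB page */

static uint32_t
toy_page_pt2off(uint32_t pt1_idx)
{

	/* PT2s of four consecutive PT1 slots share one page, in order. */
	return ((pt1_idx & T_PT2PG_MASK) * T_NB_IN_PT2);
}

int
main(void)
{

	assert(toy_page_pt2off(0) == 0);	/* first PT2 in its page */
	assert(toy_page_pt2off(5) == 1024);	/* 5 & 3 == 1: second PT2 */
	assert(toy_page_pt2off(7) == 3072);	/* 7 & 3 == 3: last PT2 */
	return (0);
}
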
*/ static __inline pt2_entry_t * page_pt2(vm_offset_t pgva, u_int pt1_idx) { return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); } /* * Get virtual address of PT2s page (mapped in PT2MAP) * which holds PT2 which holds entry which maps given virtual address. */ static __inline vm_offset_t pt2map_pt2pg(vm_offset_t va) { va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); return ((vm_offset_t)pt2map_entry(va)); } /***************************************************************************** * * THREE pmap initialization milestones exist: * * locore.S * -> fundamental init (including MMU) in ASM * * initarm() * -> fundamental init continues in C * -> first available physical address is known * * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) * -> basic (safe) interface for physical address allocation is made * -> basic (safe) interface for virtual mapping is made * -> limited not SMP coherent work is possible * * -> more fundamental init continues in C * -> locks and some more things are available * -> all fundamental allocations and mappings are done * * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) * -> phys_avail[] and virtual_avail is set * -> control is passed to vm subsystem * -> physical and virtual address allocation are off limit * -> low level mapping functions, some SMP coherent, * are available, which cannot be used before vm subsystem * is being inited * * mi_startup() * -> vm subsystem is being inited * * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) * -> pmap is fully inited * *****************************************************************************/ /***************************************************************************** * * PMAP first stage initialization and utility functions * for pre-bootstrap epoch. * * After pmap_bootstrap_prepare() is called, the following functions * can be used: * * (1) strictly only for this stage functions for physical page allocations, * virtual space allocations, and mappings: * * vm_paddr_t pmap_preboot_get_pages(u_int num); * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); * vm_offset_t pmap_preboot_reserve_pages(u_int num); * vm_offset_t pmap_preboot_get_vpages(u_int num); * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, * vm_prot_t prot, vm_memattr_t attr); * * (2) for all stages: * * vm_paddr_t pmap_kextract(vm_offset_t va); * * NOTE: This is not SMP coherent stage. * *****************************************************************************/ #define KERNEL_P2V(pa) \ ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) #define KERNEL_V2P(va) \ ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) static vm_paddr_t last_paddr; /* * Pre-bootstrap epoch page allocator. */ vm_paddr_t pmap_preboot_get_pages(u_int num) { vm_paddr_t ret; ret = last_paddr; last_paddr += num * PAGE_SIZE; return (ret); } /* * The fundamental initalization of PMAP stuff. * * Some things already happened in locore.S and some things could happen * before pmap_bootstrap_prepare() is called, so let's recall what is done: * 1. Caches are disabled. * 2. We are running on virtual addresses already with 'boot_pt1' * as L1 page table. * 3. So far, all virtual addresses can be converted to physical ones and * vice versa by the following macros: * KERNEL_P2V(pa) .... physical to virtual ones, * KERNEL_V2P(va) .... virtual to physical ones. * * What is done herein: * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. * 2. 
PT2MAP magic is brought to live. * 3. Basic preboot functions for page allocations and mappings can be used. * 4. Everything is prepared for L1 cache enabling. * * Variations: * 1. To use second TTB register, so kernel and users page tables will be * separated. This way process forking - pmap_pinit() - could be faster, * it saves physical pages and KVA per a process, and it's simple change. * However, it will lead, due to hardware matter, to the following: * (a) 2G space for kernel and 2G space for users. * (b) 1G space for kernel in low addresses and 3G for users above it. * A question is: Is the case (b) really an option? Note that case (b) * does save neither physical memory and KVA. */ void pmap_bootstrap_prepare(vm_paddr_t last) { vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; vm_offset_t pt2pg_va; pt1_entry_t *pte1p; pt2_entry_t *pte2p; u_int i; uint32_t actlr_mask, actlr_set, l1_attr; /* * Now, we are going to make real kernel mapping. Note that we are * already running on some mapping made in locore.S and we expect * that it's large enough to ensure nofault access to physical memory * allocated herein before switch. * * As kernel image and everything needed before are and will be mapped * by section mappings, we align last physical address to PTE1_SIZE. */ last_paddr = pte1_roundup(last); /* * Allocate and zero page(s) for kernel L1 page table. * * Note that it's first allocation on space which was PTE1_SIZE * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. */ base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); bzero((void*)kern_pt1, NB_IN_PT1); pte1_sync_range(kern_pt1, NB_IN_PT1); /* Allocate and zero page(s) for kernel PT2TAB. */ pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); bzero(kern_pt2tab, NB_IN_PT2TAB); pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); /* Allocate and zero page(s) for kernel L2 page tables. */ pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); pt2pg_va = KERNEL_P2V(pt2pg_pa); size = NKPT2PG * PAGE_SIZE; bzero((void*)pt2pg_va, size); pte2_sync_range((pt2_entry_t *)pt2pg_va, size); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated pages for kernel L2 page tables so that vm_page * structures representing these pages will be created. The vm_page * structures are required for promotion of the corresponding kernel * virtual addresses to section mappings. */ vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); /* * Insert allocated L2 page table pages to PT2TAB and make * link to all PT2s in L1 page table. See how kernel_vm_end * is initialized. * * We play simple and safe. So every KVA will have underlaying * L2 page table, even kernel image mapped by sections. */ pte2p = kern_pt2tab_entry(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) pt2tab_store(pte2p++, PTE2_KPT(pa)); pte1p = kern_pte1(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) pte1_store(pte1p++, PTE1_LINK(pa)); /* Make section mappings for kernel. */ l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); pte1p = kern_pte1(KERNBASE); for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); /* * Get free and aligned space for PT2MAP and make L1 page table links * to L2 page tables held in PT2TAB. * * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus * each entry in PT2TAB maps all PT2s in a page. 
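
[Ed. note: because every PT2 is mapped contiguously at PT2MAP, the address of the PTE2 that maps a given VA is a fixed linear function of that VA. A toy sketch of that lookup; the base address and the index formula (virtual page number, 4KB pages) are assumptions for illustration, not the kernel's exact pt2map_entry().]

#include <assert.h>
#include <stdint.h>

#define T_PAGE_SHIFT	12		/* assumed 4KB pages */
#define T_PT2MAP_BASE	0xbf800000u	/* assumed placement */

static uint32_t
toy_pt2map_entry_addr(uint32_t va)
{

	/* One 4-byte pte2 per virtual page, laid out linearly. */
	return (T_PT2MAP_BASE + (va >> T_PAGE_SHIFT) * 4);
}

int
main(void)
{

	/* The pte2 for the next virtual page sits 4 bytes further on. */
	assert(toy_pt2map_entry_addr(0xc0001000) ==
	    toy_pt2map_entry_addr(0xc0000000) + 4);
	return (0);
}
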
This implies that * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. */ PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); pte1p = kern_pte1((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. * Each pmap will hold own PT2TAB, so the mapping should be not global. */ pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* * Choose correct L2 page table and make mappings for allocations * made herein which replaces temporary locore.S mappings after a while. * Note that PT2MAP cannot be used until we switch to kern_pt1. * * Note, that these allocations started aligned on 1M section and * kernel PT1 was allocated first. Making of mappings must follow * order of physical allocations as we've used KERNEL_P2V() macro * for virtual addresses resolution. */ pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); /* Make mapping for kernel L1 page table. */ for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Make mapping for kernel PT2TAB. */ for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ pmap_kern_ttb = base_pt1 | ttb_flags; cpuinfo_get_actlr_modifier(&actlr_mask, &actlr_set); reinit_mmu(pmap_kern_ttb, actlr_mask, actlr_set); /* * Initialize the first available KVA. As kernel image is mapped by * sections, we are leaving some gap behind. */ virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; } /* * Setup L2 page table page for given KVA. * Used in pre-bootstrap epoch. * * Note that we have allocated NKPT2PG pages for L2 page tables in advance * and used them for mapping KVA starting from KERNBASE. However, this is not * enough. Vectors and devices need L2 page tables too. Note that they are * even above VM_MAX_KERNEL_ADDRESS. */ static __inline vm_paddr_t pmap_preboot_pt2pg_setup(vm_offset_t va) { pt2_entry_t *pte2p, pte2; vm_paddr_t pt2pg_pa; /* Get associated entry in PT2TAB. */ pte2p = kern_pt2tab_entry(va); /* Just return, if PT2s page exists already. */ pte2 = pt2tab_load(pte2p); if (pte2_is_valid(pte2)) return (pte2_pa(pte2)); KASSERT(va >= VM_MAX_KERNEL_ADDRESS, ("%s: NKPT2PG too small", __func__)); /* * Allocate page for PT2s and insert it to PT2TAB. * In other words, map it into PT2MAP space. */ pt2pg_pa = pmap_preboot_get_pages(1); pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); /* Zero all PT2s in allocated page. */ bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); return (pt2pg_pa); } /* * Setup L2 page table for given KVA. * Used in pre-bootstrap epoch. */ static void pmap_preboot_pt2_setup(vm_offset_t va) { pt1_entry_t *pte1p; vm_paddr_t pt2pg_pa, pt2_pa; /* Setup PT2's page. */ pt2pg_pa = pmap_preboot_pt2pg_setup(va); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); /* Insert PT2 to PT1. */ pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } /* * Get L2 page entry associated with given KVA. * Used in pre-bootstrap epoch. */ static __inline pt2_entry_t* pmap_preboot_vtopte2(vm_offset_t va) { pt1_entry_t *pte1p; /* Setup PT2 if needed. 
*/ pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ pmap_preboot_pt2_setup(va); return (pt2map_entry(va)); } /* * Pre-bootstrap epoch page(s) mapping(s). */ void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) { u_int i; pt2_entry_t *pte2p; /* Map all the pages. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KRW(pa)); va += PAGE_SIZE; pa += PAGE_SIZE; } } /* * Pre-bootstrap epoch virtual space alocator. */ vm_offset_t pmap_preboot_reserve_pages(u_int num) { u_int i; vm_offset_t start, va; pt2_entry_t *pte2p; /* Allocate virtual space. */ start = va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Zero the mapping. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, 0); va += PAGE_SIZE; } return (start); } /* * Pre-bootstrap epoch page(s) allocation and mapping(s). */ vm_offset_t pmap_preboot_get_vpages(u_int num) { vm_paddr_t pa; vm_offset_t va; /* Allocate physical page(s). */ pa = pmap_preboot_get_pages(num); /* Allocate virtual space. */ va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Map and zero all. */ pmap_preboot_map_pages(pa, va, num); bzero((void *)va, num * PAGE_SIZE); return (va); } /* * Pre-bootstrap epoch page mapping(s) with attributes. */ void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, vm_prot_t prot, vm_memattr_t attr) { u_int num; u_int l1_attr, l1_prot, l2_prot, l2_attr; pt1_entry_t *pte1p; pt2_entry_t *pte2p; l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l2_attr = vm_memattr_to_pte2(attr); l1_prot = ATTR_TO_L1(l2_prot); l1_attr = ATTR_TO_L1(l2_attr); /* Map all the pages. */ num = round_page(size); while (num > 0) { if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); va += PTE1_SIZE; pa += PTE1_SIZE; num -= PTE1_SIZE; } else { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); va += PAGE_SIZE; pa += PAGE_SIZE; num -= PAGE_SIZE; } } } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". */ vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); } else if (pte1_is_link(pte1)) { /* * We should beware of concurrent promotion that changes * pte1 at this point. However, it's not a problem as PT2 * page is preserved by promotion in PT2TAB. So even if * it happens, using of PT2MAP is still safe. * * QQQ: However, concurrent removing is a problem which * ends in abort on PT2MAP space. Locking must be used * to deal with this. */ pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2) | (va & PTE2_OFFSET); } else { panic("%s: va %#x pte1 %#x", __func__, va, pte1); } return (pa); } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". Also * return L2 page table entry which maps the address. * * This is only intended to be used for panic dumps. 
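
[Ed. note: the size/alignment decision inside pmap_preboot_map_attr() above amounts to a greedy loop: take a 1MB section step whenever both addresses are section-aligned and at least 1MB remains, otherwise take a 4KB page step. A standalone sketch with assumed sizes; toy_map() only prints what it would map.]

#include <stdint.h>
#include <stdio.h>

#define T_PTE1_SIZE	0x100000u	/* assumed 1MB sections */
#define T_PAGE_SIZE	0x1000u		/* assumed 4KB pages */

static void
toy_map(uint32_t pa, uint32_t va, uint32_t size)
{

	/* size is assumed to be a multiple of T_PAGE_SIZE already. */
	while (size > 0) {
		if (((va | pa) & (T_PTE1_SIZE - 1)) == 0 &&
		    size >= T_PTE1_SIZE) {
			printf("section %#x -> %#x\n", va, pa);
			va += T_PTE1_SIZE; pa += T_PTE1_SIZE;
			size -= T_PTE1_SIZE;
		} else {
			printf("page    %#x -> %#x\n", va, pa);
			va += T_PAGE_SIZE; pa += T_PAGE_SIZE;
			size -= T_PAGE_SIZE;
		}
	}
}

int
main(void)
{

	/* Two full sections, then one trailing page. */
	toy_map(0x20000000, 0xc0000000, 2 * T_PTE1_SIZE + T_PAGE_SIZE);
	return (0);
}
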
*/ vm_paddr_t pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2); } else { pte2 = 0; pa = 0; } if (pte2p != NULL) *pte2p = pte2; return (pa); } /***************************************************************************** * * PMAP second stage initialization and utility functions * for bootstrap epoch. * * After pmap_bootstrap() is called, the following functions for * mappings can be used: * * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); * void pmap_kremove(vm_offset_t va); * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, * int prot); * * NOTE: This is not SMP coherent stage. And physical page allocation is not * allowed during this stage. * *****************************************************************************/ /* * Initialize kernel PMAP locks and lists, kernel_pmap itself, and * reserve various virtual spaces for temporary mappings. */ void pmap_bootstrap(vm_offset_t firstaddr) { pt2_entry_t *unused __unused; struct sysmaps *sysmaps; u_int i; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ kernel_pmap->pm_pt1 = kern_pt1; kernel_pmap->pm_pt2tab = kern_pt2tab; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) do { \ v = (c)pmap_preboot_reserve_pages(n); \ p = pt2map_entry((vm_offset_t)v); \ } while (0) /* * Local CMAP1/CMAP2 are used for zeroing and copying pages. * Local CMAP3 is used for data cache cleaning. * Global CMAP3 is used for the idle process page zeroing. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1); SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1); SYSMAP(caddr_t, sysmaps->CMAP3, sysmaps->CADDR3, 1); } SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); /* * _tmppt is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, _tmppt, 1); /* * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), * respectively. PADDR3 is used by pmap_pte2_ddb(). */ SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); #ifdef DDB SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); #endif mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); /* * Note that in very short time in initarm(), we are going to * initialize phys_avail[] array and no futher page allocation * can happen after that until vm subsystem will be initialized. 
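
[Ed. note: the SYSMAP() macro above captures a reserved VA together with a pointer to its page-table slot, so the page behind that VA can later be switched with a single store. A toy model of the idiom; all toy_* names and constants are stand-ins.]

#include <assert.h>
#include <stdint.h>

typedef uint32_t toy_pte2_t;

static toy_pte2_t toy_pt2map[1024];		/* fake linear PTE array */
static uintptr_t toy_next_va = 0xc0400000;	/* fake reservation cursor */

static void
toy_sysmap(void **vap, toy_pte2_t **ptep)
{

	/* Hand out one page of VA and remember its PTE slot. */
	*vap = (void *)toy_next_va;
	*ptep = &toy_pt2map[(toy_next_va >> 12) % 1024];
	toy_next_va += 4096;
}

int
main(void)
{
	void *caddr;
	toy_pte2_t *cmap;

	toy_sysmap(&caddr, &cmap);
	assert(*cmap == 0);		/* reserved but not yet mapped */
	*cmap = 0x20000000u | 1;	/* later: aim the window at a page */
	(void)caddr;
	return (0);
}
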
*/ kernel_vm_end_new = kernel_vm_end; virtual_end = vm_max_kernel_address; } static void pmap_init_qpages(void) { struct pcpu *pc; int i; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("%s: unable to allocate KVA", __func__); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL); /* * The function can already be use in second initialization stage. * As such, the function DOES NOT call pmap_growkernel() where PT2 * allocation can happen. So if used, be sure that PT2 for given * virtual address is allocated already! * * Add a wired page to the kva. * Note: not SMP coherent. */ static __inline void pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, uint32_t attr) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ /* * This is a very low level function, so PT2 and particularly * PT2PG associated with given virtual address must be already * allocated. It's a pain mainly during pmap initialization * stage. However, called after pmap initialization with * virtual address not under kernel_vm_end will lead to * the same misery. */ if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) panic("%s: kernel PT2 not allocated!", __func__); } pte2p = pt2map_entry(va); pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); } PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt2_entry_t *pte2p; pte2p = pt2map_entry(va); pte2_clear(pte2p); } /* * Share new kernel PT2PG with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) { pmap_t pmap; pt2_entry_t *pte2p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, npte2); } mtx_unlock_spin(&allpmaps_lock); } /* * Share new kernel PTE1 with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } mtx_unlock_spin(&allpmaps_lock); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * NOTE: Read the comments above pmap_kenter_prot_attr() as * the function is used herein! */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t pte1_offset; pt1_entry_t npte1; uint32_t l1prot, l2prot; uint32_t l1attr, l2attr; PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," " prot = %d\n", __func__, *virt, start, end, end - start, prot)); l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; l2prot |= (prot & VM_PROT_EXECUTE) ? 
PTE2_X : PTE2_NX; l1prot = ATTR_TO_L1(l2prot); l2attr = PTE2_ATTR_DEFAULT; l1attr = ATTR_TO_L1(l2attr); va = *virt; /* * Does the physical address range's size and alignment permit at * least one section mapping to be created? */ pte1_offset = start & PTE1_OFFSET; if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= PTE1_SIZE) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of section mappings. */ if ((va & PTE1_OFFSET) < pte1_offset) va = pte1_trunc(va) + pte1_offset; else if ((va & PTE1_OFFSET) > pte1_offset) va = pte1_roundup(va) + pte1_offset; } sva = va; while (start < end) { if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { KASSERT((va & PTE1_OFFSET) == 0, ("%s: misaligned va %#x", __func__, va)); npte1 = PTE1_KERN(start, l1prot, l1attr); pmap_kenter_pte1(va, npte1); va += PTE1_SIZE; start += PTE1_SIZE; } else { pmap_kenter_prot_attr(va, start, l2prot, l2attr); va += PAGE_SIZE; start += PAGE_SIZE; } } tlb_flush_range(sva, va - sva); *virt = va; return (sva); } /* * Make a temporary mapping for a physical address. * This is only intended to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); tlb_flush_local(va); return ((void *)crashdumpmap); } /************************************* * * TLB & cache maintenance routines. * *************************************/ /* * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_tlb_flush(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush(va); } PMAP_INLINE void pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush_range(sva, size); } /* * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PTE2_* bits * are ever set, PTE2_V in particular. * - Assumes we can write to pte2s without pte2_store() atomic ops. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PTE2_V. * - Assumes a vm_offset_t will fit in a pte2 (true for arm). * Because PTE2_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_pte2list_alloc(vm_offset_t *head) { pt2_entry_t *pte2p; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte2p = pt2map_entry(va); *head = *pte2p; if (*head & PTE2_V) panic("%s: va with PTE2_V set!", __func__); *pte2p = 0; return (va); } static void pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) { pt2_entry_t *pte2p; if (va & PTE2_V) panic("%s: freeing va with PTE2_V set!", __func__); pte2p = pt2map_entry(va); *pte2p = *head; /* virtual! PTE2_V is 0 though */ *head = va; } static void pmap_pte2list_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_pte2list_free(head, va); } } /***************************************************************************** * * PMAP third and final stage initialization. * * After pmap_init() is called, PMAP subsystem is fully initialized. 
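
[Ed. note: pmap_pte2list_alloc()/free()/init() above thread a LIFO freelist of unused KVA pages through their own (invalid) PTE slots, so the list costs no extra storage. A single-threaded toy version of the same trick; names and constants are invented. As in the original, init pushes the highest VA first, so the lowest VA is handed out first.]

#include <assert.h>
#include <stdint.h>

#define T_NPAGES	8
#define T_BASE		0xd0000000u

static uintptr_t toy_pte[T_NPAGES];	/* one PTE slot per page; 0 ends list */

static int
toy_idx(uintptr_t va)
{

	return ((int)((va - T_BASE) / 4096));
}

static void
toy_free(uintptr_t *head, uintptr_t va)
{

	toy_pte[toy_idx(va)] = *head;	/* thread old head through the slot */
	*head = va;
}

static uintptr_t
toy_alloc(uintptr_t *head)
{
	uintptr_t va;

	va = *head;
	assert(va != 0);		/* upstream panics when exhausted */
	*head = toy_pte[toy_idx(va)];
	toy_pte[toy_idx(va)] = 0;
	return (va);
}

int
main(void)
{
	uintptr_t head;
	int i;

	head = 0;
	for (i = T_NPAGES - 1; i >= 0; i--)	/* pmap_pte2list_init() order */
		toy_free(&head, T_BASE + (uintptr_t)i * 4096);
	assert(toy_alloc(&head) == T_BASE);	/* lowest VA comes out first */
	return (0);
}
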
* *****************************************************************************/ SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static u_long nkpt2pg = NKPT2PG; SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); static int sp_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &sp_enabled, 0, "Are large page mappings enabled?"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, "1MB page mapping counters"); static u_long pmap_pte1_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pte1_demotions, 0, "1MB page demotions"); static u_long pmap_pte1_mappings; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pte1_mappings, 0, "1MB page mappings"); static u_long pmap_pte1_p_failures; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pte1_p_failures, 0, "1MB page promotion failures"); static u_long pmap_pte1_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pte1_promotions, 0, "1MB page promotions"); +static u_long pmap_pte1_kern_demotions; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, + &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); + +static u_long pmap_pte1_kern_promotions; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, + &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); + static __inline ttb_entry_t pmap_ttb_get(pmap_t pmap) { return (vtophys(pmap->pm_pt1) | ttb_flags); } /* * Initialize a vm_page's machine-dependent fields. * * Variations: * 1. Pages for L2 page tables are always not managed. So, pv_list and * pt2_wirecount can share same physical space. However, proper * initialization on a page alloc for page tables and reinitialization * on the page free must be ensured. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); pt2_wirecount_init(m); m->md.pat_mode = VM_MEMATTR_DEFAULT; } /* * Virtualization for faster way how to zero whole page. */ static __inline void pagezero(void *page) { bzero(page, PAGE_SIZE); } /* * Zero L2 page table page. * Use same KVA as in pmap_zero_page(). */ static __inline vm_paddr_t pmap_pt2pg_zero(vm_page_t m) { vm_paddr_t pa; struct sysmaps *sysmaps; pa = VM_PAGE_TO_PHYS(m); /* * XXX: For now, we map whole page even if it's already zero, * to sync it even if the sync is only DSB. */ sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); /* Even VM_ALLOC_ZERO request is only advisory. */ if ((m->flags & PG_ZERO) == 0) pagezero(sysmaps->CADDR2); pte2_sync_range((pt2_entry_t *)sysmaps->CADDR2, PAGE_SIZE); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); return (pa); } /* * Init just allocated page as L2 page table(s) holder * and return its physical address. */ static __inline vm_paddr_t pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_paddr_t pa; pt2_entry_t *pte2p; /* Check page attributes. */ if (m->md.pat_mode != pt_memattr) pmap_page_set_memattr(m, pt_memattr); /* Zero page and init wire counts. 
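
[Ed. note: pmap_pt2pg_zero() above follows the classic per-CPU temporary-mapping discipline: pin to a CPU, lock that CPU's slot, install the mapping, do the work, then unmap and flush. A compilable toy with one slot and a pthread mutex standing in for the kernel primitives; all toy_* names are assumptions.]

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>

/* One temporary-mapping slot; the kernel keeps one per CPU. */
static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static uintptr_t toy_cmap;		/* stand-in for the PTE slot */
static uint8_t toy_caddr[4096];		/* stand-in for the mapped window */

static void
toy_zero_phys_page(uintptr_t pa)
{

	/* Upstream also sched_pin()s so the per-CPU slot stays local. */
	pthread_mutex_lock(&toy_lock);
	assert(toy_cmap == 0);			/* slot must be free */
	toy_cmap = pa | 1;			/* "map" pa at the window */
	memset(toy_caddr, 0, sizeof(toy_caddr));
	toy_cmap = 0;				/* unmap; upstream flushes TLB */
	pthread_mutex_unlock(&toy_lock);
}

int
main(void)
{

	toy_zero_phys_page(0x20000000);
	return (0);
}
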
*/ pa = pmap_pt2pg_zero(m); pt2_wirecount_init(m); /* * Map page to PT2MAP address space for given pmap. * Note that PT2MAP space is shared with all pmaps. */ if (pmap == kernel_pmap) pmap_kenter_pt2tab(va, PTE2_KPT(pa)); else { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, PTE2_KPT_NG(pa)); } return (pa); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; pt2_entry_t *pte2p, pte2; u_int i, pte1_idx, pv_npg; PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); /* * Initialize the vm page array entries for kernel pmap's * L2 page table pages allocated in advance. */ pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { vm_paddr_t pa; vm_page_t m; pte2 = pte2_load(pte2p); KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: L2 page table page is out of range", __func__)); m->pindex = pte1_idx; m->phys_addr = pa; pte1_idx += NPT2_IN_PG; } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); if (sp_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("%s: can't assign to pagesizes[1]", __func__)); pagesizes[1] = PTE1_SIZE; } /* * Calculate the size of the pv head table for sections. * Handle the possibility that "vm_phys_segs[...].end" is zero. * Note that the table is only for sections which could be promoted. */ first_managed_pa = pte1_trunc(vm_phys_segs[0].start); pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) - first_managed_pa) / PTE1_SIZE + 1; /* * Allocate memory for the pv head table for sections. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("%s: not enough kvm for pv chunks", __func__); pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. 
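 */

[Ed. note: a worked instance of the pv_npg sizing computed in pmap_init() above, with assumed segment addresses (512MB of managed RAM starting at 0x20000000); the end address is trimmed by one page and truncated, then counted inclusively.]

#include <assert.h>
#include <stdint.h>

#define T_PTE1_SIZE	0x100000u
#define T_PAGE_SIZE	0x1000u
#define t_pte1_trunc(x)	((x) & ~(T_PTE1_SIZE - 1))

int
main(void)
{
	uint32_t first_managed_pa, seg_end, pv_npg;

	first_managed_pa = t_pte1_trunc(0x20000000u);	/* first seg start */
	seg_end = 0x40000000u;				/* last seg end */
	pv_npg = (t_pte1_trunc(seg_end - T_PAGE_SIZE) - first_managed_pa) /
	    T_PTE1_SIZE + 1;
	assert(pv_npg == 512);		/* 512MB managed: 512 section heads */
	return (0);
}
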
*/ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { u_int anychanged; pt2_entry_t *epte2p, *pte2p, pte2; vm_page_t m; vm_paddr_t pa; anychanged = 0; pte2p = pt2map_entry(sva); epte2p = pte2p + count; while (pte2p < epte2p) { m = *ma++; pa = VM_PAGE_TO_PHYS(m); pte2 = pte2_load(pte2p); if ((pte2_pa(pte2) != pa) || (pte2_attr(pte2) != vm_page_pte2_attr(m))) { anychanged++; pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); } pte2p++; } if (__predict_false(anychanged)) tlb_flush_range(sva, count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } tlb_flush_range(sva, va - sva); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); } /* * If the given pmap is not the current or kernel pmap, the returned * pte2 must be released by passing it to pmap_pte2_release(). */ static pt2_entry_t * pmap_pte2(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Releases a pte2 that was obtained from pmap_pte2(). * Be prepared for the pte2p being NULL. */ static __inline void pmap_pte2_release(pt2_entry_t *pte2p) { if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { mtx_unlock(&PMAP2mutex); } } /* * Super fast pmap_pte2 routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire tlb flush for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt2_entry_t * pmap_pte2_quick(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); /* Note that L2 page table size is not equal to PAGE_SIZE. 
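
[Ed. note: pmap_pte2() above re-points the borrowed PMAP2 window, and pays for a TLB flush, only when the window does not already map the wanted PT2 page. The caching test in isolation, with invented names and a counter standing in for the flush.]

#include <assert.h>
#include <stdint.h>

static uint32_t toy_window_pa;	/* PT2 page the window currently maps */
static int toy_flushes;		/* simulated TLB flushes */

static void
toy_window_point(uint32_t pt2pg_pa)
{

	if (toy_window_pa != pt2pg_pa) {	/* miss: retarget and flush */
		toy_window_pa = pt2pg_pa;
		toy_flushes++;
	}					/* hit: reuse as-is */
}

int
main(void)
{

	toy_window_point(0x1000);
	toy_window_point(0x1000);	/* hit, no flush */
	toy_window_point(0x2000);
	assert(toy_flushes == 2);
	return (0);
}
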
*/ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t *pte2p; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) pa = pte1_pa(pte1) | (va & PTE1_OFFSET); else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); pmap_pte2_release(pte2p); } else pa = 0; PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa, lockpa; pt1_entry_t pte1; pt2_entry_t pte2, *pte2p; vm_page_t m; lockpa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) goto retry; m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (pte2_is_valid(pte2) && (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { pa = pte2_pa(pte2); if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) goto retry; m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } PA_UNLOCK_COND(lockpa); PMAP_UNLOCK(pmap); return (m); } /* * Grow the number of kernel L2 page table entries, if needed. */ void pmap_growkernel(vm_offset_t addr) { vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pt1_entry_t pte1; pt2_entry_t pte2; PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); /* * At all times, kernel_vm_end is the first KVA for which the underlying * L2 page table is either not allocated or not linked from the L1 page * table (not considering sections), except for two possible cases: * * (1) in the very beginning, as long as pmap_growkernel() has not * been called, it could be the first unused KVA (which is not * rounded up to PTE1_SIZE), * * (2) when all KVA space is mapped and kernel_map->max_offset * address is not rounded up to PTE1_SIZE. (For example, * it could be 0xFFFFFFFF.) */ kernel_vm_end = pte1_roundup(kernel_vm_end); mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, PTE1_SIZE); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pte1 = pte1_load(kern_pte1(kernel_vm_end)); if (pte1_is_valid(pte1)) { kernel_vm_end += PTE1_SIZE; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * kernel_vm_end_new is used in pmap_pinit() when kernel * mappings are entered to the new pmap all at once to avoid a race * between pmap_kenter_pte1() and the kernel_vm_end increase. * The same applies to pmap_kenter_pt2tab().
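/*
 * [Editorial sketch] How a caller might use pmap_extract_and_hold() from
 * above: the returned page is held, so it cannot be reused while the caller
 * works with it, and the hold must be dropped afterwards. Illustrative only.
 *
 *	vm_page_t m;
 *
 *	m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *	if (m != NULL) {
 *		// ... the page is safe to examine here ...
 *		vm_page_lock(m);
 *		vm_page_unhold(m);
 *		vm_page_unlock(m);
 *	}
 */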
*/ kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into kernel PT2TAB. */ m = vm_page_alloc(NULL, pte1_index(kernel_vm_end) & ~PT2PG_MASK, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow kernel", __func__); /* * QQQ: To link all new L2 page tables from L1 page * table now and so pmap_kenter_pte1() them * at once together with pmap_kenter_pt2tab() * could be nice speed up. However, * pmap_growkernel() does not happen so often... * QQQ: The other TTBR is another option. */ pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, m); } else pt2pg_pa = pte2_pa(pte2); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); kernel_vm_end = kernel_vm_end_new; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = vm_max_kernel_address - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = vm_max_kernel_address - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /*********************************************** * * Pmap allocation/deallocation routines. * ***********************************************/ /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); PMAP_LOCK_INIT(pmap); /* * Kernel page table directory and pmap stuff around is already * initialized, we are using it right now and here. So, finish * only PMAP structures initialization for process0 ... * * Since the L1 page table and PT2TAB is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pt1 = kern_pt1; pmap->pm_pt2tab = kern_pt2tab; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); CPU_SET(0, &pmap->pm_active); } static __inline void pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pte1_index(sva); count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); bcopy(spte1p + idx, dpte1p + idx, count); } static __inline void pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pt2tab_index(sva); count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); bcopy(spte2p + idx, dpte2p + idx, count); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; vm_paddr_t pa, pt2tab_pa; u_int i; PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, pmap->pm_pt1)); /* * No need to allocate L2 page table space yet but we do need * a valid L1 page table and PT2TAB table. * * Install shared kernel mappings to these tables. It's a little * tricky as some parts of KVA are reserved for vectors, devices, * and whatever else. These parts are supposed to be above * vm_max_kernel_address. 
Thus two regions should be installed: * * (1) <KERNBASE, kernel_vm_end), * (2) <vm_max_kernel_address, 0xFFFFFFFF>. * * QQQ: The second region should be stable enough to be installed * only once, at the time the tables are allocated. * QQQ: Maybe copy of both regions at once could be faster ... * QQQ: Maybe the other TTBR is an option. * * Finally, install own PT2TAB table to these tables. */ if (pmap->pm_pt1 == NULL) { pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); if (pmap->pm_pt1 == NULL) return (0); } if (pmap->pm_pt2tab == NULL) { /* * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page * only, which should be the only size for 32 bit systems, * then we could allocate it with vm_page_alloc() and all * the stuff needed as other L2 page table pages. * (2) Note that a process PT2TAB is special L2 page table * page. Its mapping in kernel_arena is permanent and can * be used no matter which process is current. Its mapping * in PT2MAP can be used only for current process. */ pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); if (pmap->pm_pt2tab == NULL) { /* * QQQ: As struct pmap is allocated from UMA with * UMA_ZONE_NOFREE flag, it's important to leave * no allocation in pmap if initialization failed. */ kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, NB_IN_PT1); pmap->pm_pt1 = NULL; return (0); } /* * QQQ: Each L2 page table page vm_page_t has pindex set to * pte1 index of virtual address mapped by this page. * It's not valid for non kernel PT2TABs themselves. * The pindex of these pages cannot be altered because * of the way they are allocated now. However, it * should not be a problem. */ } mtx_lock_spin(&allpmaps_lock); /* * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), * kernel_vm_end_new is used here instead of kernel_vm_end. */ pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, kernel_vm_end_new - 1); pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 0xFFFFFFFF); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, kernel_vm_end_new - 1); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 0xFFFFFFFF); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. * I.e. self reference mapping. The PT2TAB is private, however mapped * into shared PT2MAP space, so the mapping should not be global. */ pt2tab_pa = vtophys(pmap->pm_pt2tab); pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* Insert PT2MAP PT2s into pmap PT1. */ pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Now synchronize the new mappings which were made above. */ pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } #ifdef INVARIANTS static boolean_t pt2tab_user_is_empty(pt2_entry_t *tab) { u_int i, end; end = pt2tab_index(VM_MAXUSER_ADDRESS); for (i = 0; i < end; i++) if (tab[i] != 0) return (FALSE); return (TRUE); } #endif /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings.
*/ void pmap_release(pmap_t pmap) { #ifdef INVARIANTS vm_offset_t start, end; #endif KASSERT(pmap->pm_stats.resident_count == 0, ("%s: pmap resident count %ld != 0", __func__, pmap->pm_stats.resident_count)); KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), ("%s: has allocated user PT2(s)", __func__)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); #ifdef INVARIANTS start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); bzero((char *)pmap->pm_pt1 + start, end - start); start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); bzero((char *)pmap->pm_pt2tab + start, end - start); #endif /* * We are leaving PT1 and PT2TAB allocated on released pmap, * so hopefully UMA vmspace_zone will always be initialized with * the UMA_ZONE_NOFREE flag. */ } /********************************************************* * * L2 table pages and their page management routines. * *********************************************************/ /* * Virtual interface for L2 page table wire counting. * * Each L2 page table in a page has its own counter which counts the number * of valid mappings in the table. The global page counter counts mappings * in all tables in the page, plus a single self-mapping in PT2TAB. * * During a promotion we leave the associated L2 page table counter * untouched, so the table (strictly speaking a page which holds it) * is never freed if promoted. * * If a page m->wire_count == 1 then no valid mappings exist in any L2 page * table in the page and the page itself is only mapped in PT2TAB. */ static __inline void pt2_wirecount_init(vm_page_t m) { u_int i; /* * Note: A page m is allocated with VM_ALLOC_WIRED flag and * m->wire_count should be already set correctly. * So, there is no need to set it again herein. */ for (i = 0; i < NPT2_IN_PG; i++) m->md.pt2_wirecount[i] = 0; } static __inline void pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) { /* * Note: A just modified pte2 (i.e. already allocated) * acquires one extra reference which must be * explicitly cleared. It influences the KASSERTs herein. * All L2 page tables in a page always belong to the same * pmap, so we allow only one extra reference for the page.
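/*
 * [Editorial sketch] The wire counting contract described above implies an
 * invariant that a debugging build could check: a PT2 page's wire_count
 * equals one (its own PT2TAB mapping) plus the sum of the per-table
 * counters. The checker below is hypothetical, not part of this change.
 *
 *	static void
 *	pt2pg_check_wirecounts(vm_page_t m)
 *	{
 *		u_int i, sum;
 *
 *		sum = 0;
 *		for (i = 0; i < NPT2_IN_PG; i++)
 *			sum += m->md.pt2_wirecount[i];
 *		KASSERT(m->wire_count == 1 + sum,
 *		    ("PT2PG %p: wire_count %u != 1 + %u", m,
 *		    m->wire_count, sum));
 *	}
 */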
*/ KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), ("%s: PT2 is overflowing ...", __func__)); KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowing ...", __func__)); m->wire_count++; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; } static __inline void pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) { KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, ("%s: PT2 is underflowing ...", __func__)); KASSERT(m->wire_count > 1, ("%s: PT2PG is underflowing ...", __func__)); m->wire_count--; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; } static __inline void pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) { KASSERT(count <= NPTE2_IN_PT2, ("%s: invalid count %u", __func__, count)); KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; m->wire_count += count; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); } static __inline uint32_t pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) { return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); } static __inline boolean_t pt2_is_empty(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); } static __inline boolean_t pt2_is_full(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == NPTE2_IN_PT2); } static __inline boolean_t pt2pg_is_empty(vm_page_t m) { return (m->wire_count == 1); } /* * This routine is called if the L2 page table * is not mapped correctly. */ static vm_page_t _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { uint32_t pte1_idx; pt1_entry_t *pte1p; pt2_entry_t pte2; vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pte1_idx = pte1_index(va); pte1p = pmap->pm_pt1 + pte1_idx; KASSERT(pte1_load(pte1p) == 0, ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, pte1_load(pte1p))); pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into pmap PT2TAB. */ m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, * the L2 page table page may have been allocated. */ return (NULL); } pmap->pm_stats.resident_count++; pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { pt2pg_pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pt2pg_pa); } pt2_wirecount_inc(m, pte1_idx); pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); return (m); } static vm_page_t pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { u_int pte1_idx; pt1_entry_t *pte1p, pte1; vm_page_t m; pte1_idx = pte1_index(va); retry: pte1p = pmap->pm_pt1 + pte1_idx; pte1 = pte1_load(pte1p); /* * This supports switching from a 1MB page to a * normal 4K page. */ if (pte1_is_section(pte1)) { (void)pmap_demote_pte1(pmap, pte1p, va); /* * Reload pte1 after demotion. * * Note: Demotion can even fail, as either the PT2 is not found * for the virtual address or the PT2PG cannot be allocated. */ pte1 = pte1_load(pte1p); } /* * If the L2 page table page is mapped, we just increment the * hold count, and activate it.
*/ if (pte1_is_link(pte1)) { m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(m, pte1_idx); } else { /* * Here if the PT2 isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte2(pmap, va, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused L2 page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) { /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ #ifdef PMAP_DEBUG pmap_zero_page_check(m); #endif m->flags |= PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Unwire L2 page tables page. */ static void pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt1_entry_t *pte1p, opte1 __unused; pt2_entry_t *pte2p; uint32_t i; KASSERT(pt2pg_is_empty(m), ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); /* * Unmap all L2 page tables in the page from L1 page table. * * QQQ: Individual L2 page tables (except the last one) can be unmapped * earlier. However, we are doing that this way. */ KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); pte1p = pmap->pm_pt1 + m->pindex; for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { KASSERT(m->md.pt2_wirecount[i] == 0, ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); opte1 = pte1_load(pte1p); if (pte1_is_link(opte1)) { pte1_clear(pte1p); /* * Flush intermediate TLB cache. */ pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); } #ifdef INVARIANTS else KASSERT((opte1 == 0) || pte1_is_section(opte1), ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, pmap, va, opte1, i)); #endif } /* * Unmap the page from PT2TAB. */ pte2p = pmap_pt2tab_entry(pmap, va); (void)pt2tab_load_clear(pte2p); pmap_tlb_flush(pmap, pt2map_pt2pg(va)); m->wire_count = 0; pmap->pm_stats.resident_count--; /* * This is a release store so that the ordinary store unmapping * the L2 page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); } /* * Decrements a L2 page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static __inline boolean_t pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pt2_wirecount_dec(m, pte1_index(va)); if (pt2pg_is_empty(m)) { /* * QQQ: Wire count is zero, so whole page should be zero and * we can set PG_ZERO flag to it. * Note that when promotion is enabled, it takes some * more efforts. See pmap_unwire_pt2_all() below. */ pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); return (TRUE); } else return (FALSE); } /* * Drop a L2 page table page's wire count at once, which is used to record * the number of valid L2 page table entries within the page. If the wire * count drops to zero, then the L2 page table page is unmapped. 
*/ static __inline void pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { u_int pte1_idx = pte1_index(va); KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, pt2_wirecount_get(m, pte1_idx))); /* * It's possible that the L2 page table was never used. * This happens when a section is created without promotion. */ if (pt2_is_full(m, va)) { pt2_wirecount_set(m, pte1_idx, 0); /* * QQQ: We clear L2 page table now, so when L2 page table page * is going to be freed, we can set its PG_ZERO flag ... * This function is called only on section mappings, so * hopefully it's not too big an overhead. * * XXX: If pmap is current, existing PT2MAP mapping could be * used for zeroing. */ pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); } #ifdef INVARIANTS else KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", __func__, pt2_wirecount_get(m, pte1_idx))); #endif if (pt2pg_is_empty(m)) { pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); } } /* * After removing an L2 page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static boolean_t pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt1_entry_t pte1; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (FALSE); pte1 = pte1_load(pmap_pte1(pmap, va)); mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); return (pmap_unwire_pt2(pmap, va, mpte, free)); } /************************************* * * Page management routines. * *************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * Is the given page managed?
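/*
 * [Editorial note] The pv chunk geometry asserted above works out as
 * follows: one chunk is exactly one page and carries _NPCPV = 336 pv
 * entries tracked by _NPCM = 11 32-bit freemask words. Since
 * 336 = 10 * 32 + 16, words 0-9 use the full mask PC_FREE0_9 = 0xffffffff
 * and word 10 uses PC_FREE10 = 0x0000ffff (only its low 16 bits are valid).
 * A pv entry index maps to its bitmap position as in free_pv_entry():
 *
 *	field = idx / 32;	(which pc_map[] word)
 *	bit   = idx % 32;	(which bit within that word)
 *
 * e.g. idx 100 -> field 3, bit 4; idx 335 -> field 10, bit 15.
 */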
*/ static __inline boolean_t is_managed(vm_paddr_t pa) { vm_offset_t pgnum; vm_page_t m; pgnum = atop(pa); if (pgnum >= first_page) { m = PHYS_TO_VM_PAGE(pa); if (m == NULL) return (FALSE); if ((m->oflags & VPO_UNMANAGED) == 0) return (TRUE); } return (FALSE); } static __inline boolean_t pte1_is_managed(pt1_entry_t pte1) { return (is_managed(pte1_pa(pte1))); } static __inline boolean_t pte2_is_managed(pt2_entry_t pte2) { return (is_managed(pte2_pa(pte2))); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pt1_entry_t *pte1p; pmap_t pmap; pt2_entry_t *pte2p, tpte2; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffs(inuse) - 1; pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) continue; pte2p = pmap_pte2(pmap, va); tpte2 = pte2_load(pte2p); if ((tpte2 & PTE2_W) == 0) tpte2 = pte2_load_clear(pte2p); pmap_pte2_release(pte2p); if ((tpte2 & PTE2_W) != 0) continue; KASSERT(tpte2 != 0, ("pmap_pv_reclaim: pmap %p va %#x zero pte", pmap, va)); pmap_tlb_flush(pmap, va); m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); if (pte2_is_dirty(tpte2)) vm_page_dirty(m); if ((tpte2 & PTE2_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt2(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. 
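/*
 * [Editorial sketch] pmap_pv_reclaim() above walks the allocated entries of
 * a chunk by inverting the free bitmap and peeling off set bits with ffs().
 * The idiom in isolation (names match the surrounding code):
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	while (inuse != 0) {
 *		bit = ffs(inuse) - 1;		// lowest in-use entry
 *		pv = &pc->pc_pventry[field * 32 + bit];
 *		// ... tear down the mapping described by pv ...
 *		inuse &= ~(1UL << bit);		// clear and continue
 *	}
 */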
*/ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); } /* * Free the pv_entry back to the free list. */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } /* * Get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffs(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the pte2list "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_pte2list_alloc() completes. 
*/ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } static void pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); /* * Transfer the 1mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pte1: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 1mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() * removes one of the mappings that is being promoted. 
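/*
 * [Editorial note] Scale of the pv transfers above: a 1MB section covers
 * PTE1_SIZE / PAGE_SIZE = 256 small pages, so pmap_pv_demote_pte1() turns
 * one section pv entry into 256 per-page entries (1 moved + 255 created),
 * while pmap_pv_promote_pte1() below does the reverse (1 moved back + 255
 * freed). Both loops run from va up to va_last = va + PTE1_SIZE - PAGE_SIZE
 * inclusive.
 */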
*/ m = PHYS_TO_VM_PAGE(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a section. */ static boolean_t pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } +static inline void +pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) +{ + + /* Kill all the small mappings or the big one only. */ + if (pte1_is_section(npte1)) + pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE) + else + pmap_tlb_flush(pmap, pte1_trunc(va)); +} + /* + * Update kernel pte1 on all pmaps. + * + * The following function is called only on one cpu with interrupts disabled. + * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way + * nobody can invoke explicit hardware table walk during the update of pte1. + * Unsolicited hardware table walk can still happen, invoked by speculative + * data or instruction prefetch or even by speculative hardware table walk. + * + * The break-before-make approach should be implemented here. However, it's + * not so easy to do that for kernel mappings, as the kernel would then be + * unmapping, however voluntarily, mappings it may itself be using. + */ +static void +pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) +{ + pmap_t pmap; + pt1_entry_t *pte1p; + + /* + * Get current pmap. Interrupts should be disabled here + * so PCPU_GET() is done atomically. + */ + pmap = PCPU_GET(curpmap); + if (pmap == NULL) + pmap = kernel_pmap; + + /* + * (1) Change pte1 on current pmap. + * (2) Flush all obsolete TLB entries on current CPU. + * (3) Change pte1 on all pmaps. + * (4) Flush all obsolete TLB entries on all CPUs in SMP case. + */ + + pte1p = pmap_pte1(pmap, va); + pte1_store(pte1p, npte1); + + /* Kill all the small mappings or the big one only. */ + if (pte1_is_section(npte1)) { + pmap_pte1_kern_promotions++; + tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); + } else { + pmap_pte1_kern_demotions++; + tlb_flush_local(pte1_trunc(va)); + } + + /* + * In SMP case, this function is called when all cpus are at smp + * rendezvous, so there is no need to use 'allpmaps_lock' lock here. + * In UP case, the function is called with this lock locked. + */ + LIST_FOREACH(pmap, &allpmaps, pm_list) { + pte1p = pmap_pte1(pmap, va); + pte1_store(pte1p, npte1); + } + +#ifdef SMP + /* Kill all the small mappings or the big one only.
*/ + if (pte1_is_section(npte1)) + tlb_flush_range(pte1_trunc(va), PTE1_SIZE); + else + tlb_flush(pte1_trunc(va)); +#endif +} + +#ifdef SMP +struct pte1_action { + vm_offset_t va; + pt1_entry_t npte1; + u_int update; /* CPU that updates the PTE1 */ +}; + +static void +pmap_update_pte1_action(void *arg) +{ + struct pte1_action *act = arg; + + if (act->update == PCPU_GET(cpuid)) + pmap_update_pte1_kernel(act->va, act->npte1); +} + +/* + * Change pte1 on current pmap. + * Note that kernel pte1 must be changed on all pmaps. + * + * By the ARM ARM manual, the behaviour is UNPREDICTABLE when two or more TLB + * entries map the same VA. It's a problem when either promotion or demotion + * is being done. The pte1 update and appropriate TLB flush must be done + * atomically in general. + */ +static void +pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, + pt1_entry_t npte1) +{ + + if (pmap == kernel_pmap) { + struct pte1_action act; + + sched_pin(); + act.va = va; + act.npte1 = npte1; + act.update = PCPU_GET(cpuid); + smp_rendezvous_cpus(all_cpus, smp_no_rendevous_barrier, + pmap_update_pte1_action, NULL, &act); + sched_unpin(); + } else { + register_t cspr; + + /* + * Use break-before-make approach for changing userland + * mappings. It can cause L1 translation aborts on other + * cores in SMP case. So, special treatment is implemented + * in pmap_fault(). Interrupts are disabled here to make the + * change as quick as possible and without interruption. + */ + cspr = disable_interrupts(PSR_I | PSR_F); + pte1_clear(pte1p); + pmap_tlb_flush_pte1(pmap, va, npte1); + pte1_store(pte1p, npte1); + restore_interrupts(cspr); + } +} +#else +static void +pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, + pt1_entry_t npte1) +{ + + if (pmap == kernel_pmap) { + mtx_lock_spin(&allpmaps_lock); + pmap_update_pte1_kernel(va, npte1); + mtx_unlock_spin(&allpmaps_lock); + } else { + register_t cspr; + + /* + * Use break-before-make approach for changing userland + * mappings. It's absolutely safe in UP case when interrupts + * are disabled. + */ + cspr = disable_interrupts(PSR_I | PSR_F); + pte1_clear(pte1p); + pmap_tlb_flush_pte1(pmap, va, npte1); + pte1_store(pte1p, npte1); + restore_interrupts(cspr); + } +} +#endif + +/* * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are * within a single page table page (PT2) to a single 1MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PTE1s are replicated in each pmap but * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only * read the PTE1 from the kernel pmap. */ static void pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t npte1; pt2_entry_t *fpte2p, fpte2, fpte2_fav; pt2_entry_t *pte2p, pte2; vm_offset_t pteva __unused; vm_page_t m __unused; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is * either invalid, unused, or does not map the first 4KB physical page * within a 1MB page.
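/*
 * [Editorial sketch] The promotion scan that follows boils down to a
 * per-entry predicate: every following PTE2 must be valid, accessed, map
 * the next expected physical frame, and carry the same PTE2_PROMOTE
 * attributes as the first entry. A hypothetical distillation:
 *
 *	static __inline boolean_t
 *	pte2_promotable(pt2_entry_t pte2, pt2_entry_t fpte2,
 *	    pt2_entry_t fpte2_fav)
 *	{
 *
 *		if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav)
 *			return (FALSE);		// wrong frame, or not A/V
 *		if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE))
 *			return (FALSE);		// attribute mismatch
 *		return (TRUE);
 *	}
 */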
*/ fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); -setpte1: fpte2 = pte2_load(fpte2p); if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != (PTE2_A | PTE2_V)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", __func__, va, pmap); return; } if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", __func__, va, pmap); return; } if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set without * a TLB invalidation. - * - * Note: When modified bit is being set, then in hardware case, - * the TLB entry is re-read (updated) from PT2, and in - * software case (abort), the PTE2 is read from PT2 and - * TLB flushed if changed. The following cmpset() solves - * any race with setting this bit in both cases. */ - if (!pte2_cmpset(fpte2p, fpte2, fpte2 | PTE2_RO)) - goto setpte1; fpte2 |= PTE2_RO; + pte2_store(fpte2p, fpte2); } /* * Examine each of the other PTE2s in the specified PT2. Abort if this * PTE2 maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE2. */ fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { -setpte2: pte2 = pte2_load(pte2p); if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", __func__, va, pmap); return; } if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set * without a TLB invalidation. See note above. */ - if (!pte2_cmpset(pte2p, pte2, pte2 | PTE2_RO)) - goto setpte2; pte2 |= PTE2_RO; + pte2_store(pte2p, pte2); pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & PTE2_FRAME); CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", __func__, pteva, pmap); } if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", __func__, va, pmap); return; } fpte2_fav -= PTE2_SIZE; } /* * The page table page in its current state will stay in PT2TAB * until the PTE1 mapping the section is demoted by pmap_demote_pte1() * or destroyed by pmap_remove_pte1(). * * Note that L2 page table size is not equal to PAGE_SIZE. */ m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: PT2 page is out of range", __func__)); KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); /* - * Get pte1 from pte2 format. - */ + * Get pte1 from pte2 format. + */ npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; /* * Promote the pv entries. */ if (pte2_is_managed(fpte2)) pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); /* - * Map the section. + * Promote the mappings. */ - if (pmap == kernel_pmap) - pmap_kenter_pte1(va, npte1); - else - pte1_store(pte1p, npte1); - /* - * Flush old small mappings. We call single pmap_tlb_flush() in - * pmap_demote_pte1() and pmap_remove_pte1(), so we must be sure that - * no small mappings survive. We assume that given pmap is current and - * don't play game with PTE2_NG. 
- */ - pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); + pmap_change_pte1(pmap, pte1p, va, npte1); pmap_pte1_promotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); } /* * Zero L2 page table page. */ static __inline void pmap_clear_pt2(pt2_entry_t *fpte2p) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) pte2_clear(pte2p); } /* * Removes a 1MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { vm_page_t m; uint32_t pte1_idx; pt2_entry_t *fpte2p; vm_paddr_t pt2_pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); m = pmap_pt2_page(pmap, va); if (m == NULL) /* * QQQ: Is this function called only on promoted pte1? * We certainly do section mappings directly * (without promotion) in kernel !!! */ panic("%s: missing pt2 page", __func__); pte1_idx = pte1_index(va); /* * Initialize the L2 page table. */ fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); pmap_clear_pt2(fpte2p); /* * Remove the mapping. */ pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); /* * QQQ: We do not need to invalidate PT2MAP mapping * as we did not change it. I.e. the L2 page table page * was and still is mapped the same way. */ } /* * Do the things to unmap a section in a process. */ static void pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, struct spglist *free) { pt1_entry_t opte1; struct md_page *pvh; vm_offset_t eva, va; vm_page_t m; PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); /* * Clear and invalidate the mapping. It should occupy one and only one * TLB entry. So, pmap_tlb_flush() called with an aligned address should * be sufficient. */ opte1 = pte1_load_clear(pte1p); pmap_tlb_flush(pmap, sva); if (pte1_is_wired(opte1)) pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; if (pte1_is_managed(opte1)) { pvh = pa_to_pvh(pte1_pa(opte1)); pmap_pvh_free(pvh, pmap, sva); eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) { if (pte1_is_dirty(opte1)) vm_page_dirty(m); if (opte1 & PTE1_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { /* * L2 page table(s) can't be removed from kernel map as * kernel counts on it (stuff around pmap_growkernel()). */ pmap_remove_kernel_pte1(pmap, pte1p, sva); } else { /* * Get associated L2 page table page. * It's possible that the page was never allocated. */ m = pmap_pt2_page(pmap, sva); if (m != NULL) pmap_unwire_pt2_all(pmap, sva, m, free); } } /* * Fills L2 page table page with mappings to consecutive physical pages. */ static __inline void pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { pte2_store(pte2p, npte2); npte2 += PTE2_SIZE; } } /* * Tries to demote a 1MB page mapping. If demotion fails, the * 1MB page mapping is invalidated.
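/*
 * [Editorial note] pmap_fill_pt2() above stores NPTE2_IN_PT2 = 256 entries,
 * advancing the mapped physical address by PTE2_SIZE (4KB) each step, so a
 * single call recreates the full range one section covered:
 * 256 * 4KB = 1MB = PTE1_SIZE. pmap_demote_pte1() below relies on this to
 * replace a section mapping with an equivalent L2 page table.
 */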
*/ static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t opte1, npte1; pt2_entry_t *fpte2p, npte2; vm_paddr_t pt2pg_pa, pt2_pa; vm_page_t m; struct spglist free; uint32_t pte1_idx, isnew = 0; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); opte1 = pte1_load(pte1p); KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { KASSERT(!pte1_is_wired(opte1), ("%s: PT2 page for a wired mapping is missing", __func__)); /* * Invalidate the 1MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); pmap_free_zero_pages(&free); CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", __func__, va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; isnew = 1; /* * We init all L2 page tables in the page even if * we are going to change everything for one L2 page * table in a while. */ pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { if (va < VM_MAXUSER_ADDRESS) { if (pt2_is_empty(m, va)) isnew = 1; /* Demoting section w/o promotion. */ #ifdef INVARIANTS else KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" " count %u", __func__, pt2_wirecount_get(m, pte1_index(va)))); #endif } } pt2pg_pa = VM_PAGE_TO_PHYS(m); pte1_idx = pte1_index(va); /* * If the pmap is current, then the PT2MAP can provide access to * the page table page (promoted L2 page tables are not unmapped). * Otherwise, temporarily map the L2 page table page (m) into * the kernel's address space at either PADDR1 or PADDR2. * * Note that L2 page table size is not equal to PAGE_SIZE. */ if (pmap_is_current(pmap)) fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); } else { mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); } pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); npte1 = PTE1_LINK(pt2_pa); KASSERT((opte1 & PTE1_A) != 0, ("%s: opte1 is missing PTE1_A", __func__)); KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, ("%s: opte1 has PTE1_NM", __func__)); /* * Get pte2 from pte1 format. */ npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; /* * If the L2 page table page is new, initialize it. If the mapping * has changed attributes, update the page table entries. 
*/ if (isnew != 0) { pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); pmap_fill_pt2(fpte2p, npte2); } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != (npte2 & PTE2_PROMOTE)) pmap_fill_pt2(fpte2p, npte2); KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), ("%s: fpte2p and npte2 map different physical addresses", __func__)); if (fpte2p == PADDR2) mtx_unlock(&PMAP2mutex); /* * Demote the mapping. This pmap is locked. The old PTE1 has * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also * has not PTE1_NM set. Thus, there is no danger of a race with * another processor changing the setting of PTE1_A and/or PTE1_NM * between the read above and the store below. */ - if (pmap == kernel_pmap) - pmap_kenter_pte1(va, npte1); - else - pte1_store(pte1p, npte1); + pmap_change_pte1(pmap, pte1p, va, npte1); /* - * Flush old big mapping. The mapping should occupy one and only - * TLB entry. So, pmap_tlb_flush() called with aligned address - * should be sufficient. - */ - pmap_tlb_flush(pmap, pte1_trunc(va)); - - /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_pv_reclaim(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PTE1 must have already changed from mapping * the 1mpage to referencing the page table page. */ if (pte1_is_managed(opte1)) pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); pmap_pte1_demotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); return (TRUE); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pt2_entry_t npte2, opte2; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte2, om; boolean_t wired; va = trunc_page(va); mpte2 = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. 
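/*
 * [Editorial sketch] The "validate:" part of pmap_enter() below assembles
 * the new PTE2 from the requested protection. A hypothetical distillation
 * of that bit logic (simplified: the real code keys PTE2_U off the VA being
 * below VM_MAXUSER_ADDRESS and PTE2_NG off the pmap not being kernel_pmap):
 *
 *	static __inline pt2_entry_t
 *	pte2_prot_bits(vm_prot_t prot, boolean_t wired, boolean_t user)
 *	{
 *		pt2_entry_t bits = 0;
 *
 *		if ((prot & VM_PROT_WRITE) == 0)
 *			bits |= PTE2_RO;	// read-only
 *		if ((prot & VM_PROT_EXECUTE) == 0)
 *			bits |= PTE2_NX;	// no execute
 *		if (wired)
 *			bits |= PTE2_W;		// wired mapping
 *		if (user)
 *			bits |= PTE2_U | PTE2_NG; // user + not global
 *		return (bits);
 *	}
 */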
*/ if (va < VM_MAXUSER_ADDRESS) { mpte2 = pmap_allocpte2(pmap, va, flags); if (mpte2 == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte2 failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) panic("%s: attempted on 1MB page", __func__); pte2p = pmap_pte2_quick(pmap, va); if (pte2p == NULL) panic("%s: invalid L1 page table entry va=%#x", __func__, va); om = NULL; pa = VM_PAGE_TO_PHYS(m); opte2 = pte2_load(pte2p); opa = pte2_pa(opte2); /* * Mapping has not changed, must be protection or wiring change. */ if (pte2_is_valid(opte2) && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT2 pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT2 page will be also. */ if (wired && !pte2_is_wired(opte2)) pmap->pm_stats.wired_count++; else if (!wired && pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; /* * Remove extra pte2 reference */ if (mpte2) pt2_wirecount_dec(mpte2, pte1_index(va)); if (pte2_is_managed(opte2)) om = m; goto validate; } /* * QQQ: We think that changing physical address on writeable mapping * is not safe. Well, maybe on kernel address space with correct * locking, it can make a sense. However, we have no idea why * anyone should do that on user address space. Are we wrong? */ KASSERT((opa == 0) || (opa == pa) || !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", __func__, pmap, va, opte2, opa, pa, flags, prot)); pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (pte2_is_managed(opte2)) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } /* * Remove extra pte2 reference */ if (mpte2 != NULL) pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("%s: managed mapping within the clean submap", __func__)); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte2 = PTE2(pa, PTE2_NM, vm_page_pte2_attr(m)); if (prot & VM_PROT_WRITE) { if (pte2_is_managed(npte2)) vm_page_aflag_set(m, PGA_WRITEABLE); } else npte2 |= PTE2_RO; if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; if (wired) npte2 |= PTE2_W; if (va < VM_MAXUSER_ADDRESS) npte2 |= PTE2_U; if (pmap != kernel_pmap) npte2 |= PTE2_NG; /* * If the mapping or permission bits are different, we need * to update the pte2. * * QQQ: Think again and again what to do * if the mapping is going to be changed! */ if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, there is a race * for other threads of current process in lazy loading case. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. 
The only time icache sync is needed is after a * kernel module is loaded and the relocation info is processed; * that is done in elf_cpu_load_file(). * * QQQ: (1) Is there any better way where * or how to sync the icache? * (2) Now, we do it on a page basis. */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pat_mode == VM_MEMATTR_WB_WA && (opa != pa || (opte2 & PTE2_NX))) cache_icache_sync_fresh(va, pa, PAGE_SIZE); npte2 |= PTE2_A; if (flags & VM_PROT_WRITE) npte2 &= ~PTE2_NM; if (opte2 & PTE2_V) { /* Change mapping with break-before-make approach. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); pte2_store(pte2p, npte2); if (opte2 & PTE2_A) { if (pte2_is_managed(opte2)) vm_page_aflag_set(om, PGA_REFERENCED); } if (pte2_is_dirty(opte2)) { if (pte2_is_managed(opte2)) vm_page_dirty(om); } if (pte2_is_managed(opte2) && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else pte2_store(pte2p, npte2); } #if 0 else { /* * QQQ: At a time when both the access and the not-modified bits * are emulated by software, this should not happen. Some * analysis is needed if this really happens. A missing * TLB flush somewhere could be the reason. */ panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, va, opte2, npte2); } #endif /* * If both the L2 page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pte1(pmap, pte1p, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Do the things to unmap a page in a process. */ static int pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, struct spglist *free) { pt2_entry_t opte2; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Clear and invalidate the mapping. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", __func__, pmap, va, opte2)); if (opte2 & PTE2_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte2_is_managed(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt2(pmap, va, free)); } /* * Remove a single page from a process address space. */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt2_entry_t *pte2p; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || !pte2_is_valid(pte2_load(pte2p))) return; pmap_remove_pte2(pmap, pte2p, va, free); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * Special handling of removing one page.
A very common * operation and easy to short circuit some code. */ if (sva + PAGE_SIZE == eva) { pte1 = pte1_load(pmap_pte1(pmap, sva)); if (pte1_is_link(pte1)) { pmap_remove_page(pmap, sva, &free); goto out; } } for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; if (pmap->pm_stats.resident_count == 0) break; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that the L1 page * table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_remove_pte1(pmap, pte1p, sva, &free); continue; } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* The large page mapping was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion. */ pte1 = pte1_load(pte1p); } #endif } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being removed. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, sva, &free)) break; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt2_entry_t *pte2p, opte2; pt1_entry_t *pte1p; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " "a 1mpage in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, pv->pv_va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", __func__, pmap, pv->pv_va)); if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
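* Both bits are emulated in software herein: PTE2_A stands for * "referenced", and pte2_is_dirty() infers "modified" from the access * permissions (roughly, a writeable mapping whose PTE2_NM bit was * cleared by a write abort). This is a reading of the helpers used * below, not a hardware guarantee.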
*/ if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_unuse_pt2(pmap, pv->pv_va, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * Just a subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m, mt, mpt2pg; struct md_page *pvh; pa = pte1_pa(pte1); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte1 %#x", __func__, pte1)); if (pte1_is_dirty(pte1)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; pvh = pa_to_pvh(pa); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpt2pg = pmap_pt2_page(pmap, pv->pv_va); if (mpt2pg != NULL) pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); } /* * Just a subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m; struct md_page *pvh; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte2 %#x", __func__, pte2)); if (pte2_is_dirty(pte2)) vm_page_dirty(m); pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(pa); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt2(pmap, pv->pv_va, free); } /* * Remove all pages from the specified address space. This aids process * exit speeds. Also, this code is special-cased for the current process * only, but can have the more generic (and slightly slower) mode enabled. * This is much faster than pmap_remove in the case of running down * an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; pv_entry_t pv; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; boolean_t allfree; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing.
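* In other words, we rely on the caller (process exit, vmspace * teardown) to guarantee single-threaded access; the KASSERT and the * INVARIANTS block below only verify that assumption, they do not * enforce it.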
*/ KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), ("%s: non-current pmap %p", __func__, pmap)); #if defined(SMP) && defined(INVARIANTS) { cpuset_t other_cpus; sched_pin(); other_cpus = pmap->pm_active; CPU_CLR(PCPU_GET(cpuid), &other_cpus); sched_unpin(); KASSERT(CPU_EMPTY(&other_cpus), ("%s: pmap %p active on other cpus", __func__, pmap)); } #endif SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", __func__, pmap, pc->pc_pmap)); allfree = TRUE; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = ffs(inuse) - 1; bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; /* * Note that we cannot remove wired pages * from a process' mapping at this time */ pte1p = pmap_pte1(pmap, pv->pv_va); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) { allfree = FALSE; continue; } pte1_clear(pte1p); pmap_remove_pte1_quick(pmap, pte1, pv, &free); } else if (pte1_is_link(pte1)) { pte2p = pt2map_entry(pv->pv_va); pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) { printf("%s: pmap %p va %#x " "pte2 %#x\n", __func__, pmap, pv->pv_va, pte2); panic("bad pte2"); } if (pte2_is_wired(pte2)) { allfree = FALSE; continue; } pte2_clear(pte2p); pmap_remove_pte2_quick(pmap, pte2, pv, &free); } else { printf("%s: pmap %p va %#x pte1 %#x\n", __func__, pmap, pv->pv_va, pte1); panic("bad pte1"); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } tlb_flush_all_ng_local(); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No L2 page table pages. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpt2pg) { pt2_entry_t *pte2p, pte2; vm_paddr_t pa; struct spglist free; uint32_t l2prot; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("%s: managed mapping within the clean submap", __func__)); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a L2 page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int pte1_idx; pt1_entry_t pte1, *pte1p; vm_paddr_t pt2_pa; /* * Get L1 page table things. */ pte1_idx = pte1_index(va); pte1p = pmap_pte1(pmap, va); pte1 = pte1_load(pte1p); if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { /* * Each of NPT2_IN_PG L2 page tables on the page can * come here. Make sure that associated L1 page table * link is established. * * QQQ: It comes that we don't establish all links to * L2 page tables for newly allocated L2 page * tables page. */ KASSERT(!pte1_is_section(pte1), ("%s: pte1 %#x is section", __func__, pte1)); if (!pte1_is_link(pte1)) { pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } pt2_wirecount_inc(mpt2pg, pte1_idx); } else { /* * If the L2 page table page is mapped, we just * increment the hold count, and activate it. 
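* (The hold count here is the pt2 wire count: every valid pte2 in an * L2 page table contributes one reference, so the page backing the * table is freed only once the table becomes empty again.)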
*/ if (pte1_is_section(pte1)) { return (NULL); } else if (pte1_is_link(pte1)) { mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(mpt2pg, pte1_idx); } else { mpt2pg = _pmap_allocpte2(pmap, va, PMAP_ENTER_NOSLEEP); if (mpt2pg == NULL) return (NULL); } } } else { mpt2pg = NULL; } /* * This call to pt2map_entry() makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte2_quick(). * But that isn't as quick as pt2map_entry(). */ pte2p = pt2map_entry(va); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { if (mpt2pg != NULL) { /* * Remove extra pte2 reference */ pt2_wirecount_dec(mpt2pg, pte1_index(va)); mpt2pg = NULL; } return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpt2pg != NULL) { SLIST_INIT(&free); if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { pmap_tlb_flush(pmap, va); pmap_free_zero_pages(&free); } mpt2pg = NULL; } return (NULL); } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ pa = VM_PAGE_TO_PHYS(m); l2prot = PTE2_RO | PTE2_NM; if (va < VM_MAXUSER_ADDRESS) l2prot |= PTE2_U | PTE2_NG; if ((prot & VM_PROT_EXECUTE) == 0) l2prot |= PTE2_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ cache_icache_sync_fresh(va, pa, PAGE_SIZE); } pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); return (mpt2pg); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Tries to create 1MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pt1_entry_t *pte1p; vm_paddr_t pa; uint32_t l1prot; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte1p = pmap_pte1(pmap, va); if (pte1_is_valid(pte1_load(pte1p))) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (FALSE); } if ((m->oflags & VPO_UNMANAGED) == 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (FALSE); } } /* * Increment counters. */ pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; /* * Map the section. * * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is * made readonly? */ pa = VM_PAGE_TO_PHYS(m); l1prot = PTE1_RO | PTE1_NM; if (va < VM_MAXUSER_ADDRESS) l1prot |= PTE1_U | PTE1_NG; if ((prot & VM_PROT_EXECUTE) == 0) l1prot |= PTE1_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). 
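* For a section, the whole PTE1_SIZE range is synced in one go, * which is presumably cheaper than 256 later per-page syncs.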
*/ cache_icache_sync_fresh(va, pa, PTE1_SIZE); } pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(vm_page_pte2_attr(m)))); pmap_pte1_mappings++; CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpt2pg; vm_pindex_t diff, psize; PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", __func__, pmap, start, end, m_start, prot)); VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpt2pg = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && m->psind == 1 && sp_enabled && pmap_enter_pte1(pmap, va, m, prot)) m = &m[PTE1_SIZE / PAGE_SIZE - 1]; else mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, mpt2pg); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pt1_entry_t *pte1p; vm_paddr_t pa, pte2_pa; vm_page_t p; vm_memattr_t pat_mode; u_int l1attr, l1prot; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("%s: non-device object", __func__)); if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 1MB page boundary. */ pte2_pa = VM_PAGE_TO_PHYS(p); if (pte2_pa & PTE1_OFFSET) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 1MB pages. * * QQQ: Well, we are mapping a section, so same condition must * be hold like during promotion. It looks that only RW mapping * is done here, so readonly mapping must be done elsewhere. 
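* Note also that PTE1_A is preset below, so these mappings should * never take the FAULT_ACCESS_L1 emulation path in pmap_fault(); * presumably that is deliberate for such large device mappings.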
*/ l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); PMAP_LOCK(pmap); for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { pte1p = pmap_pte1(pmap, addr); if (!pte1_is_valid(pte1_load(pte1p))) { pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } /* Else continue on if the PTE1 is already valid. */ addr += PTE1_SIZE; } PMAP_UNLOCK(pmap); } } /* * Do the things to protect a 1mpage in a process. */ static void pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, vm_prot_t prot) { pt1_entry_t npte1, opte1; vm_offset_t eva, va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); -retry: + opte1 = npte1 = pte1_load(pte1p); if (pte1_is_managed(opte1)) { eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) if (pte1_is_dirty(opte1)) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) npte1 |= PTE1_RO | PTE1_NM; if ((prot & VM_PROT_EXECUTE) == 0) npte1 |= PTE1_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte1 != opte1) { - if (!pte1_cmpset(pte1p, opte1, npte1)) - goto retry; + pte1_store(pte1p, npte1); pmap_tlb_flush(pmap, sva); } } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { boolean_t pv_lists_locked; vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, opte2, npte2; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_protect_pte1(pmap, pte1p, sva, prot); continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping * was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. 
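* For example, with 1MB sections, sva == 0x001ff000 yields * nextva == 0x00200000; the "nextva < sva" test above catches the * wrap-around when sva lies in the last section of the address space.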
*/ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { vm_page_t m; -retry: + opte2 = npte2 = pte2_load(pte2p); if (!pte2_is_valid(opte2)) continue; if ((prot & VM_PROT_WRITE) == 0) { if (pte2_is_managed(opte2) && pte2_is_dirty(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); vm_page_dirty(m); } npte2 |= PTE2_RO | PTE2_NM; } if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte2 != opte2) { - - if (!pte2_cmpset(pte2p, opte2, npte2)) - goto retry; + pte2_store(pte2p, npte2); pmap_tlb_flush(pmap, sva); } } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt1_entry_t pte1; pt2_entry_t pte2; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) count++; } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); if (pte2_is_wired(pte2)) count++; } PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 1mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = pte1_is_dirty(pte1); } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = pte2_is_dirty(pte2); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTE2s can have PG_M set. 
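* (PG_M is a leftover from the x86 wording; on this pmap it means the * software-emulated modify state, i.e. a writeable pte2 whose PTE2_NM * bit has been cleared.)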
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt1_entry_t pte1; pt2_entry_t pte2; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, addr)); if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(addr)); rv = !pte2_is_valid(pte2) ; } PMAP_UNLOCK(pmap); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 1mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); } else { pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); opte1 = pte1_load(pte1p); if ((opte1 & PTE1_A) != 0) { /* * Since this reference bit is shared by 256 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual section number, and the pmap * address to select one 4KB page out of the 256 * on which testing the reference bit will result * in clearing that bit. This function is designed * to avoid the selection of the same 4KB page * for every 1MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. 
To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the section is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && !pte1_is_wired(opte1)) { pte1_clear_bit(pte1p, PTE1_A); pmap_tlb_flush(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(pte1_is_link(pte1_load(pte1p)), ("%s: not found a link in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if ((pte2_load(pte2p) & PTE2_A) != 0) { pte2_clear_bit(pte2p, PTE2_A); pmap_tlb_flush(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { if (!pte1_is_wired(pte1)) panic("%s: pte1 %#x not wired", __func__, pte1); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pte1_clear_bit(pte1p, PTE1_W); pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) panic("%s: demotion failed", __func__); #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. 
*/ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (!pte2_is_wired(pte2)) panic("%s: pte2 %#x is missing PTE2_W", __func__, pte2); /* * PTE2_W must be cleared atomically. Although the pmap * lock synchronizes access to PTE2_W, another processor * could be changing PTE2_NM and/or PTE2_A concurrently. */ pte2_clear_bit(pte2p, PTE2_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); if (!(pte1_load(pte1p) & PTE1_RO)) (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); -retry: opte2 = pte2_load(pte2p); if (!(opte2 & PTE2_RO)) { - if (!pte2_cmpset(pte2p, opte2, - opte2 | (PTE2_RO | PTE2_NM))) - goto retry; + pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, pte2; vm_offset_t pdnxt; vm_page_t m; boolean_t pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = pte1_trunc(sva + PTE1_SIZE); if (pdnxt < sva) pdnxt = eva; pte1p = pmap_pte1(pmap, sva); opte1 = pte1_load(pte1p); if (!pte1_is_valid(opte1)) /* XXX */ continue; else if (pte1_is_section(opte1)) { if (!pte1_is_managed(opte1)) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. 
Since the underlying L2 page * table is fully populated, this removal never * frees a L2 page table page. */ if (!pte1_is_wired(opte1)) { pte2p = pmap_pte2_quick(pmap, sva); KASSERT(pte2_is_valid(pte2_load(pte2p)), ("%s: invalid PTE2", __func__)); pmap_remove_pte2(pmap, pte2p, sva, NULL); } } if (pdnxt > eva) pdnxt = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) continue; else if (pte2_is_dirty(pte2)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); vm_page_dirty(m); } pte2_set_bit(pte2p, PTE2_NM); pte2_clear_bit(pte2p, PTE2_A); } else if ((pte2 & PTE2_A) != 0) pte2_clear_bit(pte2p, PTE2_A); else continue; pmap_tlb_flush(pmap, sva); } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("%s: page %p is exclusive busy", __func__, m)); /* * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM * cleared. If the object containing the page is locked and the page * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently * set. */ if ((m->flags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (!(opte1 & PTE1_RO)) { if (pmap_demote_pte1(pmap, pte1p, va) && !pte1_is_wired(opte1)) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); pte2p = pmap_pte2_quick(pmap, va); opte2 = pte2_load(pte2p); if ((opte2 & PTE2_V)) { pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); vm_page_dirty(m); pmap_tlb_flush(pmap, va); } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if (pte2_is_dirty(pte2_load(pte2p))) { pte2_set_bit(pte2p, PTE2_NM); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { struct sysmaps *sysmaps; vm_memattr_t oma; vm_paddr_t pa; oma = m->md.pat_mode; m->md.pat_mode = ma; CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); if ((m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* * If "m" is a normal page, flush it from the cache. * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. 
*/ if (sf_buf_invalidate_cache(m, oma)) return; #endif /* * If page is not mapped by sf buffer, map the page * transient and do invalidation. */ if (ma != oma) { pa = VM_PAGE_TO_PHYS(m); sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_memattr_to_pte2(ma))); dcache_wbinv_poc((vm_offset_t)sysmaps->CADDR2, pa, PAGE_SIZE); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } } /* * Miscellaneous support routines follow */ /* * Returns TRUE if the given page is mapped individually or as part of * a 1mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(sysmaps->CADDR2); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero(sysmaps->CADDR2 + off, size); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. 
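* Like the other zero/copy helpers above, it installs a transient * kernel mapping (CMAP3 -> CADDR3), touches the page through it, then * clears the pte2 and flushes the TLB entry. CMAP3 is reserved for * this caller, which is presumably why only sched_pin() and no sysmaps * mutex is needed here.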
*/ void pmap_zero_page_idle(vm_page_t m) { if (pte2_load(CMAP3) != 0) panic("%s: CMAP3 busy", __func__); sched_pin(); pte2_store(CMAP3, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(CADDR3); pte2_clear(CMAP3); tlb_flush((vm_offset_t)CADDR3); sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP1) != 0) panic("%s: CMAP1 busy", __func__); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP1, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), PTE2_AP_KRW, vm_page_pte2_attr(dst))); bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); pte2_clear(sysmaps->CMAP1); tlb_flush((vm_offset_t)sysmaps->CADDR1); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { struct sysmaps *sysmaps; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*sysmaps->CMAP2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pte2_store(sysmaps->CMAP1, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); tlb_flush_local((vm_offset_t)sysmaps->CADDR1); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); tlb_flush_local((vm_offset_t)sysmaps->CADDR2); a_cp = sysmaps->CADDR1 + a_pg_offset; b_cp = sysmaps->CADDR2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } pte2_clear(sysmaps->CMAP1); tlb_flush((vm_offset_t)sysmaps->CADDR1); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } vm_offset_t pmap_quick_enter_page(vm_page_t m) { pt2_entry_t *pte2p; vm_offset_t qmap_addr; critical_enter(); qmap_addr = PCPU_GET(qmap_addr); pte2p = pt2map_entry(qmap_addr); KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); return (qmap_addr); } void pmap_quick_remove_page(vm_offset_t addr) { pt2_entry_t *pte2p; vm_offset_t qmap_addr; qmap_addr = PCPU_GET(qmap_addr); pte2p = pt2map_entry(qmap_addr); KASSERT(addr == qmap_addr, ("%s: invalid address", __func__)); KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); pte2_clear(pte2p); tlb_flush(qmap_addr); critical_exit(); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
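* Here it copies valid pte2s (and, where an entire section fits into * the range, whole pte1 sections) from src_pmap to dst_pmap, typically * at fork time, so that the child can avoid demand faults; the wired * and accessed bits are cleared and PTE2_NM is set on the copies.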
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t nextva; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = nextva) { pt2_entry_t *src_pte2p, *dst_pte2p; vm_page_t dst_mpt2pg, src_mpt2pg; pt1_entry_t src_pte1; u_int pte1_idx; KASSERT(addr < VM_MAXUSER_ADDRESS, ("%s: invalid to pmap_copy page tables", __func__)); nextva = pte1_trunc(addr + PTE1_SIZE); if (nextva < addr) nextva = end_addr; pte1_idx = pte1_index(addr); src_pte1 = src_pmap->pm_pt1[pte1_idx]; if (pte1_is_section(src_pte1)) { if ((addr & PTE1_OFFSET) != 0 || (addr + PTE1_SIZE) > end_addr) continue; if (dst_pmap->pm_pt1[pte1_idx] == 0 && (!pte1_is_managed(src_pte1) || pmap_pv_insert_pte1(dst_pmap, addr, pte1_pa(src_pte1)))) { dst_pmap->pm_pt1[pte1_idx] = src_pte1 & ~PTE1_W; dst_pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } continue; } else if (!pte1_is_link(src_pte1)) continue; src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); /* * We leave PT2s to be linked from PT1 even if they are not * referenced until all PT2s in a page are without reference. * * QQQ: It could be changed ... */ #if 0 /* single_pt2_link_is_cleared */ KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, ("%s: source page table page is unused", __func__)); #else if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) continue; #endif if (nextva > end_addr) nextva = end_addr; src_pte2p = pt2map_entry(addr); while (addr < nextva) { pt2_entry_t temp_pte2; temp_pte2 = pte2_load(src_pte2p); /* * we only virtual copy managed pages */ if (pte2_is_managed(temp_pte2)) { dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dst_mpt2pg == NULL) goto out; dst_pte2p = pmap_pte2_quick(dst_pmap, addr); if (!pte2_is_valid(pte2_load(dst_pte2p)) && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ temp_pte2 &= ~(PTE2_W | PTE2_A); temp_pte2 |= PTE2_NM; pte2_store(dst_pte2p, temp_pte2); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_pt2(dst_pmap, addr, dst_mpt2pg, &free)) { pmap_tlb_flush(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= pt2_wirecount_get(src_mpt2pg, pte1_idx)) break; } addr += PAGE_SIZE; src_pte2p++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more section mappings. 
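* I.e. if "offset" lies 0x12000 bytes past a 1MB boundary, *addr is * advanced to the next address whose low 20 bits are also 0x12000, so * that the bulk of the mapping can be made (or later promoted) with * PTE1 sections.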
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t pte1_offset; if (size < PTE1_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); pte1_offset = offset & PTE1_OFFSET; if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || (*addr & PTE1_OFFSET) == pte1_offset) return; if ((*addr & PTE1_OFFSET) < pte1_offset) *addr = pte1_trunc(*addr) + pte1_offset; else *addr = pte1_roundup(*addr) + pte1_offset; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid, ttb; PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif ttb = pmap_ttb_get(pmap); /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_pagedir = ttb; cp15_ttbr_set(ttb); PCPU_SET(curpmap, pmap); critical_exit(); } /* * Perform the pmap work for mincore. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; vm_paddr_t pa; boolean_t managed; int val; PMAP_LOCK(pmap); retry: pte1p = pmap_pte1(pmap, addr); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); managed = pte1_is_managed(pte1); val = MINCORE_SUPER | MINCORE_INCORE; if (pte1_is_dirty(pte1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte1 & PTE1_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, addr); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); pa = pte2_pa(pte2); managed = pte2_is_managed(pte2); val = MINCORE_INCORE; if (pte2_is_dirty(pte2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte2 & PTE2_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else { managed = FALSE; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { vm_offset_t sva; uint32_t l2attr; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); while (size != 0) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kremove(va); va += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) { pcb->pcb_pagedir = pmap_ttb_get(pmap); } /* * Clean L1 data cache range by physical address. * The range must be within a single page. 
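* ("PoU" is the ARM Point of Unification: cleaning the d-cache to PoU * is what makes freshly written instructions visible to a subsequent * i-cache invalidate on the same core.)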
*/ static void pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) { struct sysmaps *sysmaps; KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, ("%s: not on single page", __func__)); sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP3) panic("%s: CMAP3 busy", __func__); pte2_store(sysmaps->CMAP3, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); dcache_wb_pou((vm_offset_t)sysmaps->CADDR3 + (pa & PAGE_MASK), size); pte2_clear(sysmaps->CMAP3); tlb_flush((vm_offset_t)sysmaps->CADDR3); sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Sync an instruction cache range which is not mapped yet. */ void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { uint32_t len, offset; vm_page_t m; /* Write back d-cache on given address range. */ offset = pa & PAGE_MASK; for ( ; size != 0; size -= len, pa += len, offset = 0) { len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } /* * I-cache is VIPT. The only way to flush all virtual mappings * of a given physical address is to invalidate the whole i-cache. */ icache_inv_all(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) { /* Write back d-cache on given address range. */ if (va >= VM_MIN_KERNEL_ADDRESS) { dcache_wb_pou(va, size); } else { uint32_t len, offset; vm_paddr_t pa; vm_page_t m; offset = va & PAGE_MASK; for ( ; size != 0; size -= len, va += len, offset = 0) { pa = pmap_extract(pmap, va); /* offset is preserved */ len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } } /* * I-cache is VIPT. The only way to flush all virtual mappings * of a given physical address is to invalidate the whole i-cache. */ icache_inv_all(); } /* * The implementation of pmap_fault() uses the IN_RANGE2() macro, which * depends on the fact that the given range size is a power of 2. */ CTASSERT(powerof2(NB_IN_PT1)); CTASSERT(powerof2(PT2MAP_SIZE)); #define IN_RANGE2(addr, start, size) \ ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) /* * Handle access and R/W emulation faults. */ int pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; if (pmap == NULL) pmap = kernel_pmap; /* * In the kernel, we should never get an abort with a FAR which is in the range of * the pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here, * print out a useful abort message and even get to the debugger; * otherwise it likely ends in a never-ending loop of aborts. */ if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { /* * All L1 tables should always be mapped and present. * However, we check only the current one herein. For user mode, * only a permission abort from a malicious user is not fatal, * and neither is an alignment abort, as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", __func__, pmap, pmap->pm_pt1, far); panic("%s: pm_pt1 abort", __func__); } return (KERN_INVALID_ADDRESS); } if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { /* * PT2MAP should always be mapped and present in the current * L1 table. However, only existing L2 tables are mapped * in PT2MAP. For user mode, only an L2 translation abort and * a permission abort from a malicious user are not fatal,
* and neither is an alignment abort, as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", __func__, pmap, PT2MAP, far); panic("%s: PT2MAP abort", __func__); } return (KERN_INVALID_ADDRESS); } /* + * A pmap lock is used below for handling of access and R/W emulation + * aborts. They were handled by atomic operations before, so some + * analysis of the new situation is needed to answer the following question: + * Is it safe to use the lock even for these aborts? + * + * Two cases may happen in general: + * + * (1) Aborts while the pmap lock is locked already - this should not + * happen, as the pmap lock is not recursive. However, under pmap lock only + * internal kernel data should be accessed and such data should be + * mapped with the A bit set and the NM bit cleared. If a double abort happens, + * then the mapping of the data which caused it must be fixed. Further, + * all new mappings are always made with the A bit set and the bit can be + * cleared only on managed mappings. + * + * (2) Aborts while another lock (or locks) is held - this can already + * happen. However, there is no difference here if it's either access or + * R/W emulation abort, or if it's some other abort. + */ + + PMAP_LOCK(pmap); +#ifdef SMP + /* + * Special treatment is needed due to the break-before-make approach used when + * a pte1 is updated for a userland mapping during section promotion or + * demotion. If not caught here, pmap_enter() can find a section + * mapping on the faulting address. That is not allowed. + */ + if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { + PMAP_UNLOCK(pmap); + return (KERN_SUCCESS); + } +#endif + /* * Access bits for page and section. Note that the entry * is not in TLB yet, so TLB flush is not necessary. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if (idx == FAULT_ACCESS_L2) { pte2p = pt2map_entry(far); -pte2_seta: pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { - if (!pte2_cmpset(pte2p, pte2, pte2 | PTE2_A)) { - goto pte2_seta; - } + pte2_store(pte2p, pte2 | PTE2_A); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if (idx == FAULT_ACCESS_L1) { pte1p = pmap_pte1(pmap, far); -pte1_seta: pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { - if (!pte1_cmpset(pte1p, pte1, pte1 | PTE1_A)) { - goto pte1_seta; - } + pte1_store(pte1p, pte1 | PTE1_A); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * Handle modify bits for page and section. Note that the modify * bit is emulated by software. So PTEx_RO is software read only * bit and PTEx_NM flag is real hardware read only bit. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. - * We do not lock PMAP, so cmpset() is a need. Hopefully, - * no one removes the mapping when we are here.
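+ * With the pmap lock held, the plain pte1_store()/pte2_store() updates + * in these emulation paths are sufficient: concurrent A/NM changes to + * this pmap are serialized by the lock, which is why the old cmpset() + * retry loops could be dropped.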
*/ if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { pte2p = pt2map_entry(far); -pte2_setrw: pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && (pte2 & PTE2_NM)) { - if (!pte2_cmpset(pte2p, pte2, pte2 & ~PTE2_NM)) { - goto pte2_setrw; - } + pte2_store(pte2p, pte2 & ~PTE2_NM); tlb_flush(trunc_page(far)); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { pte1p = pmap_pte1(pmap, far); -pte1_setrw: pte1 = pte1_load(pte1p); if (pte1_is_section(pte1) && !(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { - if (!pte1_cmpset(pte1p, pte1, pte1 & ~PTE1_NM)) { - goto pte1_setrw; - } + pte1_store(pte1p, pte1 & ~PTE1_NM); tlb_flush(pte1_trunc(far)); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * QQQ: The previous code, mainly fast handling of access and * modify bits aborts, could be moved to ASM. Now we are * starting to deal with not fast aborts. */ #ifdef INVARIANTS /* * Read an entry in PT2TAB associated with both pmap and far. * It's safe because PT2TAB is always mapped. - * - * QQQ: We do not lock PMAP, so false positives could happen if - * the mapping is removed concurrently. */ pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); if (pte2_is_valid(pte2)) { /* * Now, when we know that L2 page table is allocated, * we can use PT2MAP to get L2 page table entry. */ pte2 = pte2_load(pt2map_entry(far)); if (pte2_is_valid(pte2)) { /* * If L2 page table entry is valid, make sure that * L1 page table entry is valid too. Note that we * leave L2 page entries untouched when promoted. */ pte1 = pte1_load(pmap_pte1(pmap, far)); if (!pte1_is_valid(pte1)) { panic("%s: missing L1 page entry (%p, %#x)", __func__, pmap, far); } } } #endif + PMAP_UNLOCK(pmap); return (KERN_FAILURE); } #if defined(PMAP_DEBUG) /* * Reusing of KVA used in pmap_zero_page function !!! */ static void pmap_zero_page_check(vm_page_t m) { uint32_t *p, *end; struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); end = (uint32_t*)(sysmaps->CADDR2 + PAGE_SIZE); for (p = (uint32_t*)sysmaps->CADDR2; p < end; p++) if (*p != 0) panic("%s: page %p not zero, va: %p", __func__, m, sysmaps->CADDR2); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } int pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte2 = 0; int i, j, index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPTE1_IN_PT1; i++) { pt1_entry_t pte1; pt2_entry_t *pte2p, pte2; vm_offset_t base, va; vm_paddr_t pa; vm_page_t m; base = i << PTE1_SHIFT; pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1_is_section(pte1)) { /* * QQQ: Do something here! 
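* (presumably: print the section's physical address and flags, * analogously to what the link case below prints per pte2)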
*/ } else if (pte1_is_link(pte1)) { for (j = 0; j < NPTE2_IN_PT2; j++) { va = base + (j << PAGE_SHIFT); if (va >= VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (!pte2_is_valid(pte2)) continue; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pa: 0x%x, h: %d, w:" " %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte2++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } sx_sunlock(&allproc_lock); return (npte2); } #endif #ifdef DDB static pt2_entry_t * pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (!pte1_is_link(pte1)) return (NULL); if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR3); } #ifdef SMP else if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR3); } #endif return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } static void dump_pmap(pmap_t pmap) { printf("pmap %p\n", pmap); printf(" pm_pt1: %p\n", pmap->pm_pt1); printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); } DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) { pmap_t pmap; LIST_FOREACH(pmap, &allpmaps, pm_list) { dump_pmap(pmap); } } static int pte2_class(pt2_entry_t pte2) { int cls; cls = (pte2 >> 2) & 0x03; cls |= (pte2 >> 4) & 0x04; return (cls); } static void dump_section(pmap_t pmap, uint32_t pte1_idx) { } static void dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) { uint32_t i; vm_offset_t va; pt2_entry_t *pte2p, pte2; vm_page_t m; va = pte1_idx << PTE1_SHIFT; pte2p = pmap_pte2_ddb(pmap, va); for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (pte2 == 0) continue; if (!pte2_is_valid(pte2)) { printf(" 0x%08X: 0x%08X", va, pte2); if (!invalid_ok) printf(" - not valid !!!"); printf("\n"); continue; } m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); if (m != NULL) { printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, m->hold_count, m->wire_count, m->flags); } else { printf("\n"); } } } static __inline boolean_t is_pv_chunk_space(vm_offset_t va) { if ((((vm_offset_t)pv_chunkbase) <= va) && (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) return (TRUE); return (FALSE); } DB_SHOW_COMMAND(pmap, pmap_pmap_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va, eva; vm_page_t m; uint32_t i; boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; if (have_addr) { pmap_t pm; LIST_FOREACH(pm, &allpmaps, pm_list) if (pm == pmap) break; if (pm == NULL) { printf("given pmap %p is not in allpmaps list\n", pmap); return; } } else pmap = PCPU_GET(curpmap); eva = (modif[0] == 'u') ? 
VM_MAXUSER_ADDRESS : 0xFFFFFFFF; dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ printf("pmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); for(i = 0; i < NPTE1_IN_PT1; i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (va >= eva) break; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { dump_link_ok = TRUE; invalid_ok = FALSE; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", va, pte1, pte2, m); if (is_pv_chunk_space(va)) { printf(" - pv_chunk space"); if (dump_pv_chunk) invalid_ok = TRUE; else dump_link_ok = FALSE; } else if (m != NULL) printf(" w:%d w2:%u", m->wire_count, pt2_wirecount_get(m, pte1_index(va))); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO"); else if (pte2_pa(pte1) != pte2_pa(pte2)) printf(" !!! pt2tab entry is DIFFERENT - m: %p", PHYS_TO_VM_PAGE(pte2_pa(pte2))); printf("\n"); if (dump_link_ok) dump_link(pmap, i, invalid_ok); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } } static void dump_pt2tab(pmap_t pmap) { uint32_t i; pt2_entry_t pte2; vm_offset_t va; vm_paddr_t pa; vm_page_t m; printf("PT2TAB:\n"); for (i = 0; i < PT2TAB_ENTRIES; i++) { pte2 = pte2_load(&pmap->pm_pt2tab[i]); if (!pte2_is_valid(pte2)) continue; va = i << PT2TAB_SHIFT; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, pte2_class(pte2), !!(pte2 & PTE2_S), m); if (m != NULL) printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", m->hold_count, m->wire_count, m->flags, m->pindex); printf("\n"); } } DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va; uint32_t i, start; if (have_addr) { printf("supported only on current pmap\n"); return; } pmap = PCPU_GET(curpmap); printf("curpmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); start = pte1_index((vm_offset_t)PT2MAP); for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, !!(pte1 & PTE1_S)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, pte1, pte2); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO\n"); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } dump_pt2tab(pmap); } #endif Index: user/ngie/bsnmp_cleanup/sys/arm/include/cpu-v6.h =================================================================== --- user/ngie/bsnmp_cleanup/sys/arm/include/cpu-v6.h (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/arm/include/cpu-v6.h (revision 298468) @@ -1,586 +1,634 @@ /*- * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef MACHINE_CPU_V6_H #define MACHINE_CPU_V6_H /* There are no user serviceable parts here, they may change without notice */ #ifndef _KERNEL #error Only include this file in the kernel #endif #include #include #include #include #include #if __ARM_ARCH < 6 #error Only include this file for ARMv6 #else #define CPU_ASID_KERNEL 0 void dcache_wbinv_poc_all(void); /* !!! NOT SMP coherent function !!! */ vm_offset_t dcache_wb_pou_checked(vm_offset_t, vm_size_t); vm_offset_t icache_inv_pou_checked(vm_offset_t, vm_size_t); #ifdef DEV_PMU #include #define PMU_OVSR_C 0x80000000 /* Cycle Counter */ extern uint32_t ccnt_hi[MAXCPU]; extern int pmu_attched; #endif /* DEV_PMU */ /* * Macros to generate CP15 (system control processor) read/write functions. */ #define _FX(s...) #s #define _RF0(fname, aname...) \ static __inline register_t \ fname(void) \ { \ register_t reg; \ __asm __volatile("mrc\t" _FX(aname): "=r" (reg)); \ return(reg); \ } #define _R64F0(fname, aname) \ static __inline uint64_t \ fname(void) \ { \ uint64_t reg; \ __asm __volatile("mrrc\t" _FX(aname): "=r" (reg)); \ return(reg); \ } #define _WF0(fname, aname...) \ static __inline void \ fname(void) \ { \ __asm __volatile("mcr\t" _FX(aname)); \ } #define _WF1(fname, aname...) \ static __inline void \ fname(register_t reg) \ { \ __asm __volatile("mcr\t" _FX(aname):: "r" (reg)); \ } #define _W64F1(fname, aname...) \ static __inline void \ fname(uint64_t reg) \ { \ __asm __volatile("mcrr\t" _FX(aname):: "r" (reg)); \ } /* * Raw CP15 maintenance operations * !!! not for external use !!! 
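 * * (For reference, each _RF0/_WF1 use expands to a tiny inline function; * e.g. _RF0(cp15_dfsr_get, CP15_DFSR(%0)) becomes roughly: * static __inline register_t cp15_dfsr_get(void) { register_t reg; * __asm __volatile("mrc p15, 0, %0, c5, c0, 0" : "=r" (reg)); return (reg); } * This is a sketch only, assuming CP15_DFSR carries the usual * c5, c0, 0 operand encoding.)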
*/ /* TLB */ _WF0(_CP15_TLBIALL, CP15_TLBIALL) /* Invalidate entire unified TLB */ #if __ARM_ARCH >= 7 && defined SMP _WF0(_CP15_TLBIALLIS, CP15_TLBIALLIS) /* Invalidate entire unified TLB IS */ #endif _WF1(_CP15_TLBIASID, CP15_TLBIASID(%0)) /* Invalidate unified TLB by ASID */ #if __ARM_ARCH >= 7 && defined SMP _WF1(_CP15_TLBIASIDIS, CP15_TLBIASIDIS(%0)) /* Invalidate unified TLB by ASID IS */ #endif _WF1(_CP15_TLBIMVAA, CP15_TLBIMVAA(%0)) /* Invalidate unified TLB by MVA, all ASID */ #if __ARM_ARCH >= 7 && defined SMP _WF1(_CP15_TLBIMVAAIS, CP15_TLBIMVAAIS(%0)) /* Invalidate unified TLB by MVA, all ASID IS */ #endif _WF1(_CP15_TLBIMVA, CP15_TLBIMVA(%0)) /* Invalidate unified TLB by MVA */ _WF1(_CP15_TTB_SET, CP15_TTBR0(%0)) /* Cache and Branch predictor */ _WF0(_CP15_BPIALL, CP15_BPIALL) /* Branch predictor invalidate all */ #if __ARM_ARCH >= 7 && defined SMP _WF0(_CP15_BPIALLIS, CP15_BPIALLIS) /* Branch predictor invalidate all IS */ #endif _WF1(_CP15_BPIMVA, CP15_BPIMVA(%0)) /* Branch predictor invalidate by MVA */ _WF1(_CP15_DCCIMVAC, CP15_DCCIMVAC(%0)) /* Data cache clean and invalidate by MVA PoC */ _WF1(_CP15_DCCISW, CP15_DCCISW(%0)) /* Data cache clean and invalidate by set/way */ _WF1(_CP15_DCCMVAC, CP15_DCCMVAC(%0)) /* Data cache clean by MVA PoC */ #if __ARM_ARCH >= 7 _WF1(_CP15_DCCMVAU, CP15_DCCMVAU(%0)) /* Data cache clean by MVA PoU */ #endif _WF1(_CP15_DCCSW, CP15_DCCSW(%0)) /* Data cache clean by set/way */ _WF1(_CP15_DCIMVAC, CP15_DCIMVAC(%0)) /* Data cache invalidate by MVA PoC */ _WF1(_CP15_DCISW, CP15_DCISW(%0)) /* Data cache invalidate by set/way */ _WF0(_CP15_ICIALLU, CP15_ICIALLU) /* Instruction cache invalidate all PoU */ #if __ARM_ARCH >= 7 && defined SMP _WF0(_CP15_ICIALLUIS, CP15_ICIALLUIS) /* Instruction cache invalidate all PoU IS */ #endif _WF1(_CP15_ICIMVAU, CP15_ICIMVAU(%0)) /* Instruction cache invalidate */ /* * Publicly accessible functions */ /* CP14 Debug Registers */ _RF0(cp14_dbgdidr_get, CP14_DBGDIDR(%0)) _RF0(cp14_dbgprsr_get, CP14_DBGPRSR(%0)) _RF0(cp14_dbgoslsr_get, CP14_DBGOSLSR(%0)) _RF0(cp14_dbgosdlr_get, CP14_DBGOSDLR(%0)) _RF0(cp14_dbgdscrint_get, CP14_DBGDSCRint(%0)) _WF1(cp14_dbgdscr_v6_set, CP14_DBGDSCRext_V6(%0)) _WF1(cp14_dbgdscr_v7_set, CP14_DBGDSCRext_V7(%0)) _WF1(cp14_dbgvcr_set, CP14_DBGVCR(%0)) _WF1(cp14_dbgoslar_set, CP14_DBGOSLAR(%0)) /* Various control registers */ _RF0(cp15_cpacr_get, CP15_CPACR(%0)) _WF1(cp15_cpacr_set, CP15_CPACR(%0)) _RF0(cp15_dfsr_get, CP15_DFSR(%0)) _RF0(cp15_ifsr_get, CP15_IFSR(%0)) _WF1(cp15_prrr_set, CP15_PRRR(%0)) _WF1(cp15_nmrr_set, CP15_NMRR(%0)) _RF0(cp15_ttbr_get, CP15_TTBR0(%0)) _RF0(cp15_dfar_get, CP15_DFAR(%0)) #if __ARM_ARCH >= 7 _RF0(cp15_ifar_get, CP15_IFAR(%0)) _RF0(cp15_l2ctlr_get, CP15_L2CTLR(%0)) #endif _RF0(cp15_actlr_get, CP15_ACTLR(%0)) _WF1(cp15_actlr_set, CP15_ACTLR(%0)) _WF1(cp15_ats1cpr_set, CP15_ATS1CPR(%0)) _WF1(cp15_ats1cpw_set, CP15_ATS1CPW(%0)) +_WF1(cp15_ats1cur_set, CP15_ATS1CUR(%0)) +_WF1(cp15_ats1cuw_set, CP15_ATS1CUW(%0)) _RF0(cp15_par_get, CP15_PAR(%0)) _RF0(cp15_sctlr_get, CP15_SCTLR(%0)) /*CPU id registers */ _RF0(cp15_midr_get, CP15_MIDR(%0)) _RF0(cp15_ctr_get, CP15_CTR(%0)) _RF0(cp15_tcmtr_get, CP15_TCMTR(%0)) _RF0(cp15_tlbtr_get, CP15_TLBTR(%0)) _RF0(cp15_mpidr_get, CP15_MPIDR(%0)) _RF0(cp15_revidr_get, CP15_REVIDR(%0)) _RF0(cp15_ccsidr_get, CP15_CCSIDR(%0)) _RF0(cp15_clidr_get, CP15_CLIDR(%0)) _RF0(cp15_aidr_get, CP15_AIDR(%0)) _WF1(cp15_csselr_set, CP15_CSSELR(%0)) _RF0(cp15_id_pfr0_get, CP15_ID_PFR0(%0)) _RF0(cp15_id_pfr1_get, CP15_ID_PFR1(%0)) 
_RF0(cp15_id_dfr0_get, CP15_ID_DFR0(%0)) _RF0(cp15_id_afr0_get, CP15_ID_AFR0(%0)) _RF0(cp15_id_mmfr0_get, CP15_ID_MMFR0(%0)) _RF0(cp15_id_mmfr1_get, CP15_ID_MMFR1(%0)) _RF0(cp15_id_mmfr2_get, CP15_ID_MMFR2(%0)) _RF0(cp15_id_mmfr3_get, CP15_ID_MMFR3(%0)) _RF0(cp15_id_isar0_get, CP15_ID_ISAR0(%0)) _RF0(cp15_id_isar1_get, CP15_ID_ISAR1(%0)) _RF0(cp15_id_isar2_get, CP15_ID_ISAR2(%0)) _RF0(cp15_id_isar3_get, CP15_ID_ISAR3(%0)) _RF0(cp15_id_isar4_get, CP15_ID_ISAR4(%0)) _RF0(cp15_id_isar5_get, CP15_ID_ISAR5(%0)) _RF0(cp15_cbar_get, CP15_CBAR(%0)) /* Performance Monitor registers */ #if __ARM_ARCH == 6 && defined(CPU_ARM1176) _RF0(cp15_pmuserenr_get, CP15_PMUSERENR(%0)) _WF1(cp15_pmuserenr_set, CP15_PMUSERENR(%0)) _RF0(cp15_pmcr_get, CP15_PMCR(%0)) _WF1(cp15_pmcr_set, CP15_PMCR(%0)) _RF0(cp15_pmccntr_get, CP15_PMCCNTR(%0)) _WF1(cp15_pmccntr_set, CP15_PMCCNTR(%0)) #elif __ARM_ARCH > 6 _RF0(cp15_pmcr_get, CP15_PMCR(%0)) _WF1(cp15_pmcr_set, CP15_PMCR(%0)) _RF0(cp15_pmcnten_get, CP15_PMCNTENSET(%0)) _WF1(cp15_pmcnten_set, CP15_PMCNTENSET(%0)) _WF1(cp15_pmcnten_clr, CP15_PMCNTENCLR(%0)) _RF0(cp15_pmovsr_get, CP15_PMOVSR(%0)) _WF1(cp15_pmovsr_set, CP15_PMOVSR(%0)) _WF1(cp15_pmswinc_set, CP15_PMSWINC(%0)) _RF0(cp15_pmselr_get, CP15_PMSELR(%0)) _WF1(cp15_pmselr_set, CP15_PMSELR(%0)) _RF0(cp15_pmccntr_get, CP15_PMCCNTR(%0)) _WF1(cp15_pmccntr_set, CP15_PMCCNTR(%0)) _RF0(cp15_pmxevtyper_get, CP15_PMXEVTYPER(%0)) _WF1(cp15_pmxevtyper_set, CP15_PMXEVTYPER(%0)) _RF0(cp15_pmxevcntr_get, CP15_PMXEVCNTRR(%0)) _WF1(cp15_pmxevcntr_set, CP15_PMXEVCNTRR(%0)) _RF0(cp15_pmuserenr_get, CP15_PMUSERENR(%0)) _WF1(cp15_pmuserenr_set, CP15_PMUSERENR(%0)) _RF0(cp15_pminten_get, CP15_PMINTENSET(%0)) _WF1(cp15_pminten_set, CP15_PMINTENSET(%0)) _WF1(cp15_pminten_clr, CP15_PMINTENCLR(%0)) #endif _RF0(cp15_tpidrurw_get, CP15_TPIDRURW(%0)) _WF1(cp15_tpidrurw_set, CP15_TPIDRURW(%0)) _RF0(cp15_tpidruro_get, CP15_TPIDRURO(%0)) _WF1(cp15_tpidruro_set, CP15_TPIDRURO(%0)) _RF0(cp15_tpidrpwr_get, CP15_TPIDRPRW(%0)) _WF1(cp15_tpidrpwr_set, CP15_TPIDRPRW(%0)) /* Generic Timer registers - only use when you know the hardware is available */ _RF0(cp15_cntfrq_get, CP15_CNTFRQ(%0)) _WF1(cp15_cntfrq_set, CP15_CNTFRQ(%0)) _RF0(cp15_cntkctl_get, CP15_CNTKCTL(%0)) _WF1(cp15_cntkctl_set, CP15_CNTKCTL(%0)) _RF0(cp15_cntp_tval_get, CP15_CNTP_TVAL(%0)) _WF1(cp15_cntp_tval_set, CP15_CNTP_TVAL(%0)) _RF0(cp15_cntp_ctl_get, CP15_CNTP_CTL(%0)) _WF1(cp15_cntp_ctl_set, CP15_CNTP_CTL(%0)) _RF0(cp15_cntv_tval_get, CP15_CNTV_TVAL(%0)) _WF1(cp15_cntv_tval_set, CP15_CNTV_TVAL(%0)) _RF0(cp15_cntv_ctl_get, CP15_CNTV_CTL(%0)) _WF1(cp15_cntv_ctl_set, CP15_CNTV_CTL(%0)) _RF0(cp15_cnthctl_get, CP15_CNTHCTL(%0)) _WF1(cp15_cnthctl_set, CP15_CNTHCTL(%0)) _RF0(cp15_cnthp_tval_get, CP15_CNTHP_TVAL(%0)) _WF1(cp15_cnthp_tval_set, CP15_CNTHP_TVAL(%0)) _RF0(cp15_cnthp_ctl_get, CP15_CNTHP_CTL(%0)) _WF1(cp15_cnthp_ctl_set, CP15_CNTHP_CTL(%0)) _R64F0(cp15_cntpct_get, CP15_CNTPCT(%Q0, %R0)) _R64F0(cp15_cntvct_get, CP15_CNTVCT(%Q0, %R0)) _R64F0(cp15_cntp_cval_get, CP15_CNTP_CVAL(%Q0, %R0)) _W64F1(cp15_cntp_cval_set, CP15_CNTP_CVAL(%Q0, %R0)) _R64F0(cp15_cntv_cval_get, CP15_CNTV_CVAL(%Q0, %R0)) _W64F1(cp15_cntv_cval_set, CP15_CNTV_CVAL(%Q0, %R0)) _R64F0(cp15_cntvoff_get, CP15_CNTVOFF(%Q0, %R0)) _W64F1(cp15_cntvoff_set, CP15_CNTVOFF(%Q0, %R0)) _R64F0(cp15_cnthp_cval_get, CP15_CNTHP_CVAL(%Q0, %R0)) _W64F1(cp15_cnthp_cval_set, CP15_CNTHP_CVAL(%Q0, %R0)) #undef _FX #undef _RF0 #undef _WF0 #undef _WF1 /* * TLB maintenance operations. */ /* Local (i.e. not broadcasting ) operations. 
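 * These operate only on the executing core's TLB; the *_IS (inner * shareable) variants used by the SMP versions further below broadcast * the operation to all cores in the inner shareable domain.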
*/ /* Flush all TLB entries (even global). */ static __inline void tlb_flush_all_local(void) { dsb(); _CP15_TLBIALL(); dsb(); } /* Flush all not global TLB entries. */ static __inline void tlb_flush_all_ng_local(void) { dsb(); _CP15_TLBIASID(CPU_ASID_KERNEL); dsb(); } /* Flush single TLB entry (even global). */ static __inline void tlb_flush_local(vm_offset_t va) { KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); dsb(); _CP15_TLBIMVA(va | CPU_ASID_KERNEL); dsb(); } /* Flush range of TLB entries (even global). */ static __inline void tlb_flush_range_local(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); KASSERT((size & PAGE_MASK) == 0, ("%s: size %#x not aligned", __func__, size)); dsb(); for (; va < eva; va += PAGE_SIZE) _CP15_TLBIMVA(va | CPU_ASID_KERNEL); dsb(); } /* Broadcasting operations. */ #if __ARM_ARCH >= 7 && defined SMP static __inline void tlb_flush_all(void) { dsb(); _CP15_TLBIALLIS(); dsb(); } static __inline void tlb_flush_all_ng(void) { dsb(); _CP15_TLBIASIDIS(CPU_ASID_KERNEL); dsb(); } static __inline void tlb_flush(vm_offset_t va) { KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); dsb(); _CP15_TLBIMVAAIS(va); dsb(); } static __inline void tlb_flush_range(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); KASSERT((size & PAGE_MASK) == 0, ("%s: size %#x not aligned", __func__, size)); dsb(); for (; va < eva; va += PAGE_SIZE) _CP15_TLBIMVAAIS(va); dsb(); } #else /* SMP */ #define tlb_flush_all() tlb_flush_all_local() #define tlb_flush_all_ng() tlb_flush_all_ng_local() #define tlb_flush(va) tlb_flush_local(va) #define tlb_flush_range(va, size) tlb_flush_range_local(va, size) #endif /* SMP */ /* * Cache maintenance operations. */ /* Sync I and D caches to PoU */ static __inline void icache_sync(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { #if __ARM_ARCH >= 7 && defined SMP _CP15_DCCMVAU(va); #else _CP15_DCCMVAC(va); #endif } dsb(); #if __ARM_ARCH >= 7 && defined SMP _CP15_ICIALLUIS(); #else _CP15_ICIALLU(); #endif dsb(); isb(); } /* Invalidate I cache */ static __inline void icache_inv_all(void) { #if __ARM_ARCH >= 7 && defined SMP _CP15_ICIALLUIS(); #else _CP15_ICIALLU(); #endif dsb(); isb(); } /* Invalidate branch predictor buffer */ static __inline void bpb_inv_all(void) { #if __ARM_ARCH >= 7 && defined SMP _CP15_BPIALLIS(); #else _CP15_BPIALL(); #endif dsb(); isb(); } /* Write back D-cache to PoU */ static __inline void dcache_wb_pou(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { #if __ARM_ARCH >= 7 && defined SMP _CP15_DCCMVAU(va); #else _CP15_DCCMVAC(va); #endif } dsb(); } /* * Invalidate D-cache to PoC * * Caches are invalidated from outermost to innermost as fresh cachelines * flow in this direction. In given range, if there was no dirty cacheline * in any cache before, no stale cacheline should remain in them after this * operation finishes. 
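 * If the order were reversed, a stale line still present in L2 could be * fetched back into L1 after the L1 invalidation and before the L2 * invalidation completes.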
*/ static __inline void dcache_inv_poc(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_offset_t eva = va + size; dsb(); /* invalidate L2 first */ cpu_l2cache_inv_range(pa, size); /* then L1 */ va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCIMVAC(va); } dsb(); } /* * Discard D-cache lines to PoC, prior to overwrite by DMA engine. * * Normal invalidation does L2 then L1 to ensure that stale data from L2 doesn't * flow into L1 while invalidating. This routine is intended to be used only * when invalidating a buffer before a DMA operation loads new data into memory. * The concern in this case is that a dirty line could be evicted to main * memory, overwriting the DMA data. For that reason, the L1 is done first to * ensure that an evicted L1 line doesn't flow to L2 after the L2 has been * cleaned. */ static __inline void dcache_inv_poc_dma(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_offset_t eva = va + size; /* invalidate L1 first */ dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCIMVAC(va); } dsb(); /* then L2 */ cpu_l2cache_inv_range(pa, size); } /* * Write back D-cache to PoC * * Caches are written back from innermost to outermost as dirty cachelines * flow in this direction. In the given range, no dirty cacheline should * remain in any cache after this operation finishes. */ static __inline void dcache_wb_poc(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_offset_t eva = va + size; dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCCMVAC(va); } dsb(); cpu_l2cache_wb_range(pa, size); } /* Write back and invalidate D-cache to PoC */ static __inline void dcache_wbinv_poc(vm_offset_t sva, vm_paddr_t pa, vm_size_t size) { vm_offset_t va; vm_offset_t eva = sva + size; dsb(); /* write back L1 first */ va = sva & ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCCMVAC(va); } dsb(); /* then write back and invalidate L2 */ cpu_l2cache_wbinv_range(pa, size); /* then invalidate L1 */ va = sva & ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCIMVAC(va); } dsb(); } /* Set TTB0 register */ static __inline void cp15_ttbr_set(uint32_t reg) { dsb(); _CP15_TTB_SET(reg); dsb(); _CP15_BPIALL(); dsb(); isb(); tlb_flush_all_ng_local(); } -#endif /* _KERNEL */ + +/* + * Functions for address checking: + * + * cp15_ats1cpr_check() ... check stage 1 privileged (PL1) read access + * cp15_ats1cpw_check() ... check stage 1 privileged (PL1) write access + * cp15_ats1cur_check() ... check stage 1 unprivileged (PL0) read access + * cp15_ats1cuw_check() ... check stage 1 unprivileged (PL0) write access + * + * They must be called while interrupts are disabled to get a consistent + * result. + */ +static __inline int +cp15_ats1cpr_check(vm_offset_t addr) +{ + + cp15_ats1cpr_set(addr); + isb(); + return (cp15_par_get() & 0x01 ? EFAULT : 0); +} + +static __inline int +cp15_ats1cpw_check(vm_offset_t addr) +{ + + cp15_ats1cpw_set(addr); + isb(); + return (cp15_par_get() & 0x01 ? EFAULT : 0); +} + +static __inline int +cp15_ats1cur_check(vm_offset_t addr) +{ + + cp15_ats1cur_set(addr); + isb(); + return (cp15_par_get() & 0x01 ? EFAULT : 0); +} + +static __inline int +cp15_ats1cuw_check(vm_offset_t addr) +{ + + cp15_ats1cuw_set(addr); + isb(); + return (cp15_par_get() & 0x01 ?
EFAULT : 0); +} +#endif /* !__ARM_ARCH < 6 */ #endif /* !MACHINE_CPU_V6_H */ Index: user/ngie/bsnmp_cleanup/sys/arm/include/pmap_var.h =================================================================== --- user/ngie/bsnmp_cleanup/sys/arm/include/pmap_var.h (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/arm/include/pmap_var.h (revision 298468) @@ -1,512 +1,494 @@ /*- * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PMAP_VAR_H_ #define _MACHINE_PMAP_VAR_H_ #include #include /* * Various PMAP defines, exports, and inline function definitions, * also usable in other MD code. */ /* A number of pages in L1 page table. */ #define NPG_IN_PT1 (NB_IN_PT1 / PAGE_SIZE) /* A number of L2 page tables in a page. */ #define NPT2_IN_PG (PAGE_SIZE / NB_IN_PT2) /* A number of L2 page table entries in a page. */ #define NPTE2_IN_PG (NPT2_IN_PG * NPTE2_IN_PT2) #ifdef _KERNEL /* * An L2 page tables page contains NPT2_IN_PG L2 page tables. Masking of * pte1_idx by PT2PG_MASK gives us an index to the associated L2 page table * in a page. The PT2PG_SHIFT definition depends on NPT2_IN_PG strictly. * I.e., (1 << PT2PG_SHIFT) == NPT2_IN_PG must be fulfilled. */ #define PT2PG_SHIFT 2 #define PT2PG_MASK ((1 << PT2PG_SHIFT) - 1) /* * A PT2TAB holds all allocated L2 page table pages in a pmap. * Right shifting of a virtual address by PT2TAB_SHIFT gives us an index * to the L2 page table page in PT2TAB which holds the address mapping. */ #define PT2TAB_ENTRIES (NPTE1_IN_PT1 / NPT2_IN_PG) #define PT2TAB_SHIFT (PTE1_SHIFT + PT2PG_SHIFT) /* * All allocated L2 page table pages in a pmap are mapped into PT2MAP space. * Right shifting of a virtual address by PT2MAP_SHIFT gives us an index to * the PTE2 which maps the address. */ #define PT2MAP_SIZE (NPTE1_IN_PT1 * NB_IN_PT2) #define PT2MAP_SHIFT PTE2_SHIFT extern pt1_entry_t *kern_pt1; extern pt2_entry_t *kern_pt2tab; extern pt2_entry_t *PT2MAP; /* * Virtual interface for L1 page table management.
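 * For example, assuming 4 KB pages and 1 MB L1 sections (PTE1_SHIFT == 20, * hence PT2TAB_SHIFT == 22 and PT2MAP_SHIFT == 12), va == 0xc0876543 gives * pte1_index(va) == 0xc08, pt2tab_index(va) == 0x302 and * pt2map_index(va) == 0xc0876 in the helpers below.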
*/ static __inline u_int pte1_index(vm_offset_t va) { return (va >> PTE1_SHIFT); } static __inline pt1_entry_t * pte1_ptr(pt1_entry_t *pt1, vm_offset_t va) { return (pt1 + pte1_index(va)); } static __inline vm_offset_t pte1_trunc(vm_offset_t va) { return (va & PTE1_FRAME); } static __inline vm_offset_t pte1_roundup(vm_offset_t va) { return ((va + PTE1_OFFSET) & PTE1_FRAME); } /* * Virtual interface for L1 page table entry management. * * XXX: Some of the following functions, which now include a synchronization * barrier, are called in a loop, so it could be useful to have two versions * of them: one with the barrier and one without it. In that case, a pure * barrier pte1_sync() should be implemented as well. */ static __inline void pte1_sync(pt1_entry_t *pte1p) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte1p, sizeof(*pte1p)); #endif } static __inline void pte1_sync_range(pt1_entry_t *pte1p, vm_size_t size) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte1p, size); #endif } static __inline void pte1_store(pt1_entry_t *pte1p, pt1_entry_t pte1) { - atomic_store_rel_int(pte1p, pte1); + dmb(); + *pte1p = pte1; pte1_sync(pte1p); } static __inline void pte1_clear(pt1_entry_t *pte1p) { pte1_store(pte1p, 0); } static __inline void pte1_clear_bit(pt1_entry_t *pte1p, uint32_t bit) { - atomic_clear_int(pte1p, bit); + *pte1p &= ~bit; pte1_sync(pte1p); } static __inline boolean_t -pte1_cmpset(pt1_entry_t *pte1p, pt1_entry_t opte1, pt1_entry_t npte1) -{ - boolean_t ret; - - ret = atomic_cmpset_int(pte1p, opte1, npte1); - if (ret) pte1_sync(pte1p); - - return (ret); -} - -static __inline boolean_t pte1_is_link(pt1_entry_t pte1) { return ((pte1 & L1_TYPE_MASK) == L1_TYPE_C); } static __inline int pte1_is_section(pt1_entry_t pte1) { return ((pte1 & L1_TYPE_MASK) == L1_TYPE_S); } static __inline boolean_t pte1_is_dirty(pt1_entry_t pte1) { return ((pte1 & (PTE1_NM | PTE1_RO)) == 0); } static __inline boolean_t pte1_is_global(pt1_entry_t pte1) { return ((pte1 & PTE1_NG) == 0); } static __inline boolean_t pte1_is_valid(pt1_entry_t pte1) { int l1_type; l1_type = pte1 & L1_TYPE_MASK; return ((l1_type == L1_TYPE_C) || (l1_type == L1_TYPE_S)); } static __inline boolean_t pte1_is_wired(pt1_entry_t pte1) { return (pte1 & PTE1_W); } static __inline pt1_entry_t pte1_load(pt1_entry_t *pte1p) { pt1_entry_t pte1; pte1 = *pte1p; return (pte1); } static __inline pt1_entry_t pte1_load_clear(pt1_entry_t *pte1p) { pt1_entry_t opte1; - opte1 = atomic_readandclear_int(pte1p); + opte1 = *pte1p; + *pte1p = 0; pte1_sync(pte1p); return (opte1); } static __inline void pte1_set_bit(pt1_entry_t *pte1p, uint32_t bit) { - atomic_set_int(pte1p, bit); + *pte1p |= bit; pte1_sync(pte1p); } static __inline vm_paddr_t pte1_pa(pt1_entry_t pte1) { return ((vm_paddr_t)(pte1 & PTE1_FRAME)); } static __inline vm_paddr_t pte1_link_pa(pt1_entry_t pte1) { return ((vm_paddr_t)(pte1 & L1_C_ADDR_MASK)); } /* * Virtual interface for L2 page table entry management. * * XXX: Some of the following functions, which now include a synchronization * barrier, are called in a loop, so it could be useful to have two versions * of them: one with the barrier and one without it.
*/ static __inline void pte2_sync(pt2_entry_t *pte2p) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte2p, sizeof(*pte2p)); #endif } static __inline void pte2_sync_range(pt2_entry_t *pte2p, vm_size_t size) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte2p, size); #endif } static __inline void pte2_store(pt2_entry_t *pte2p, pt2_entry_t pte2) { - atomic_store_rel_int(pte2p, pte2); + dmb(); + *pte2p = pte2; pte2_sync(pte2p); } static __inline void pte2_clear(pt2_entry_t *pte2p) { pte2_store(pte2p, 0); } static __inline void pte2_clear_bit(pt2_entry_t *pte2p, uint32_t bit) { - atomic_clear_int(pte2p, bit); + *pte2p &= ~bit; pte2_sync(pte2p); } static __inline boolean_t -pte2_cmpset(pt2_entry_t *pte2p, pt2_entry_t opte2, pt2_entry_t npte2) -{ - boolean_t ret; - - ret = atomic_cmpset_int(pte2p, opte2, npte2); - if (ret) pte2_sync(pte2p); - - return (ret); -} - -static __inline boolean_t pte2_is_dirty(pt2_entry_t pte2) { return ((pte2 & (PTE2_NM | PTE2_RO)) == 0); } static __inline boolean_t pte2_is_global(pt2_entry_t pte2) { return ((pte2 & PTE2_NG) == 0); } static __inline boolean_t pte2_is_valid(pt2_entry_t pte2) { return (pte2 & PTE2_V); } static __inline boolean_t pte2_is_wired(pt2_entry_t pte2) { return (pte2 & PTE2_W); } static __inline pt2_entry_t pte2_load(pt2_entry_t *pte2p) { pt2_entry_t pte2; pte2 = *pte2p; return (pte2); } static __inline pt2_entry_t pte2_load_clear(pt2_entry_t *pte2p) { pt2_entry_t opte2; - opte2 = atomic_readandclear_int(pte2p); + opte2 = *pte2p; + *pte2p = 0; pte2_sync(pte2p); return (opte2); } static __inline void pte2_set_bit(pt2_entry_t *pte2p, uint32_t bit) { - atomic_set_int(pte2p, bit); + *pte2p |= bit; pte2_sync(pte2p); } static __inline void pte2_set_wired(pt2_entry_t *pte2p, boolean_t wired) { /* * Wired bit is transparent for page table walk, * so pte2_sync() is not needed. */ if (wired) - atomic_set_int(pte2p, PTE2_W); + *pte2p |= PTE2_W; else - atomic_clear_int(pte2p, PTE2_W); + *pte2p &= ~PTE2_W; } static __inline vm_paddr_t pte2_pa(pt2_entry_t pte2) { return ((vm_paddr_t)(pte2 & PTE2_FRAME)); } static __inline u_int pte2_attr(pt2_entry_t pte2) { return ((u_int)(pte2 & PTE2_ATTR_MASK)); } /* * Virtual interface for L2 page tables mapping management. */ static __inline u_int pt2tab_index(vm_offset_t va) { return (va >> PT2TAB_SHIFT); } static __inline pt2_entry_t * pt2tab_entry(pt2_entry_t *pt2tab, vm_offset_t va) { return (pt2tab + pt2tab_index(va)); } static __inline void pt2tab_store(pt2_entry_t *pte2p, pt2_entry_t pte2) { pte2_store(pte2p,pte2); } static __inline pt2_entry_t pt2tab_load(pt2_entry_t *pte2p) { return (pte2_load(pte2p)); } static __inline pt2_entry_t pt2tab_load_clear(pt2_entry_t *pte2p) { return (pte2_load_clear(pte2p)); } static __inline u_int pt2map_index(vm_offset_t va) { return (va >> PT2MAP_SHIFT); } static __inline pt2_entry_t * pt2map_entry(vm_offset_t va) { return (PT2MAP + pt2map_index(va)); } /* * Virtual interface for pmap structure & kernel shortcuts. 
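 * (These helpers just compose the pte1/pte2 primitives above with the * pmap's own page tables and the kernel's page tables, respectively.)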
*/ static __inline pt1_entry_t * pmap_pte1(pmap_t pmap, vm_offset_t va) { return (pte1_ptr(pmap->pm_pt1, va)); } static __inline pt1_entry_t * kern_pte1(vm_offset_t va) { return (pte1_ptr(kern_pt1, va)); } static __inline pt2_entry_t * pmap_pt2tab_entry(pmap_t pmap, vm_offset_t va) { return (pt2tab_entry(pmap->pm_pt2tab, va)); } static __inline pt2_entry_t * kern_pt2tab_entry(vm_offset_t va) { return (pt2tab_entry(kern_pt2tab, va)); } static __inline vm_page_t pmap_pt2_page(pmap_t pmap, vm_offset_t va) { pt2_entry_t pte2; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); return (PHYS_TO_VM_PAGE(pte2 & PTE2_FRAME)); } static __inline vm_page_t kern_pt2_page(vm_offset_t va) { pt2_entry_t pte2; pte2 = pte2_load(kern_pt2tab_entry(va)); return (PHYS_TO_VM_PAGE(pte2 & PTE2_FRAME)); } #endif /* _KERNEL */ #endif /* !_MACHINE_PMAP_VAR_H_ */ Index: user/ngie/bsnmp_cleanup/sys/contrib/rdma/krping/krping.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/contrib/rdma/krping/krping.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/contrib/rdma/krping/krping.c (revision 298468) @@ -1,3347 +1,3347 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" extern int krping_debug; #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x) #define PRINTF(cb, x...) 
log(LOG_INFO, x) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("RDMA ping client/server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, FASTREG = 2, MW = 3, MR = 4 }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, {"fr", OPT_INT, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) static struct mutex krping_mutex; /* * List of running krping threads. */ static LIST_HEAD(krping_cbs); /* * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/addr/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. */ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct.
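 * One control block is allocated per running krping thread (linked on * the krping_cbs list above); it carries the per-connection verbs * resources, the parsed test options and the test state.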
*/ struct krping_cb { void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; struct ib_send_wr fastreg_wr; struct ib_send_wr invalidate_wr; struct ib_mr *fastreg_mr; int server_invalidate; int read_inv; u8 key; struct ib_mw *mw; struct ib_mw_bind bind_attr; struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ struct krping_rdma_info recv_buf;/* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work request record */ struct ib_sge send_sgl; struct krping_rdma_info send_buf;/* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) struct ib_mr *send_mr; struct ib_send_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; DECLARE_PCI_UNMAP_ADDR(rdma_mapping) struct ib_mr *rdma_mr; uint32_t remote_rkey; /* remote guy's RKEY */ uint64_t remote_addr; /* remote guy's TO */ uint32_t remote_len; /* remote guy's LEN */ char *start_buf; /* rdma read src */ u64 start_dma_addr; DECLARE_PCI_UNMAP_ADDR(start_mapping) struct ib_mr *start_mr; enum test_state state; /* used for cond/signalling */ wait_queue_head_t sem; struct krping_stats stats; uint16_t port; /* dst port in NBO */ struct in_addr addr; /* dst addr in NBO */ char *addr_str; /* dst addr string */ int verbose; /* verbose logging */ int count; /* ping count */ int size; /* ping data size */ int validate; /* validate ping data */ int wlat; /* run wlat test */ int rlat; /* run rlat test */ int bw; /* run bw test */ int duplex; /* run bw full duplex test */ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ int frtest; /* fastreg test */ int testnum; /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == cb->cm_id) ?
"parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { PRINTF(cb, "rdma_resolve_route error %d\n", ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; cb->child_cm_id = cma_id; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: if (cb->state == IDLE) { cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; } else { PRINTF(cb, "Received connection request in wrong state" " (%d)\n", cb->state); } DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: DEBUG_LOG(cb, "ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: PRINTF(cb, "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: PRINTF(cb, "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: PRINTF(cb, "cma detected device removal!!!!\n"); break; default: PRINTF(cb, "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; struct ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->state == ERROR) { PRINTF(cb, "cq completion in ERROR state\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { DEBUG_LOG(cb, "cq flushed\n"); continue; } else { PRINTF(cb, "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: DEBUG_LOG(cb, "send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: DEBUG_LOG(cb, "rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: DEBUG_LOG(cb, "rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case 
IB_WC_RECV: DEBUG_LOG(cb, "recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { PRINTF(cb, "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: PRINTF(cb, "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { PRINTF(cb, "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; DEBUG_LOG(cb, "accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; if (cb->local_dma_lkey) cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->recv_sgl.lkey = cb->dma_mr->lkey; else cb->recv_sgl.lkey = cb->recv_mr->lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; if (cb->local_dma_lkey) cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->send_sgl.lkey = cb->dma_mr->lkey; else cb->send_sgl.lkey = cb->send_mr->lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->rdma_sgl.addr = cb->rdma_dma_addr; if (cb->mem == MR) cb->rdma_sgl.lkey = cb->rdma_mr->lkey; cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; cb->rdma_sq_wr.num_sge = 1; } switch(cb->mem) { case FASTREG: /* * A chain of 2 WRs, INVALIDATE_MR + FAST_REG_MR. * Both are unsignaled. The client uses them to reregister * the rdma buffers with a new key each iteration.
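 * (krping_rdma_rkey() below refreshes the key byte with * ib_update_fast_reg_key() before this chain is posted.)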
*/ cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; cb->fastreg_wr.wr.fast_reg.length = cb->size; cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; cb->invalidate_wr.next = &cb->fastreg_wr; cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; break; case MW: cb->bind_attr.wr_id = 0xabbaabba; cb->bind_attr.send_flags = 0; /* unsignaled */ #ifdef BIND_INFO cb->bind_attr.bind_info.length = cb->size; #else cb->bind_attr.length = cb->size; #endif break; default: break; } } static int krping_setup_buffers(struct krping_cb *cb) { int ret; struct ib_phys_buf buf; u64 iovbase; DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); if (cb->mem == DMA) { cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE); if (IS_ERR(cb->dma_mr)) { DEBUG_LOG(cb, "reg_dmamr failed\n"); ret = PTR_ERR(cb->dma_mr); goto bail; } } else { if (!cb->local_dma_lkey) { buf.addr = cb->recv_dma_addr; buf.size = sizeof cb->recv_buf; DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->recv_dma_addr; cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE, &iovbase); if (IS_ERR(cb->recv_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->recv_mr); goto bail; } buf.addr = cb->send_dma_addr; buf.size = sizeof cb->send_buf; DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->send_dma_addr; cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 0, &iovbase); if (IS_ERR(cb->send_mr)) { DEBUG_LOG(cb, "send_buf reg_mr failed\n"); ret = PTR_ERR(cb->send_mr); goto bail; } } } cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->rdma_buf) { DEBUG_LOG(cb, "rdma_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, cb->rdma_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); if (cb->mem != DMA) { switch (cb->mem) { case FASTREG: cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; cb->page_list = ib_alloc_fast_reg_page_list( cb->pd->device, cb->page_list_len); if (IS_ERR(cb->page_list)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->page_list); goto bail; } cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, cb->page_list->max_page_list_len); if (IS_ERR(cb->fastreg_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->fastreg_mr); goto bail; } DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" " page_list_len %u\n", cb->fastreg_mr->rkey, cb->page_list, cb->page_list_len); break; case MW: cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); if (IS_ERR(cb->mw)) { DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); ret = PTR_ERR(cb->mw); goto bail; } DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); /*FALLTHROUGH*/ case MR: buf.addr = cb->rdma_dma_addr; buf.size = cb->size; iovbase = cb->rdma_dma_addr; cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE, &iovbase); if (IS_ERR(cb->rdma_mr)) { DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); ret = 
PTR_ERR(cb->rdma_mr); goto bail; } DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n", (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey); break; default: ret = -EINVAL; goto bail; break; } } if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->start_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->start_buf) { DEBUG_LOG(cb, "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, cb->start_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); if (cb->mem == MR || cb->mem == MW) { unsigned flags = IB_ACCESS_REMOTE_READ; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) { flags |= IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; } buf.addr = cb->start_dma_addr; buf.size = cb->size; DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->start_dma_addr; cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, flags, &iovbase); if (IS_ERR(cb->start_mr)) { DEBUG_LOG(cb, "start_buf reg_mr failed\n"); ret = PTR_ERR(cb->start_mr); goto bail; } } } krping_setup_wr(cb); DEBUG_LOG(cb, "allocated & registered buffers...\n"); return 0; bail: if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) ib_dereg_mr(cb->fastreg_mr); if (cb->mw && !IS_ERR(cb->mw)) ib_dealloc_mw(cb->mw); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); if (cb->page_list && !IS_ERR(cb->page_list)) ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); if (cb->recv_mr && !IS_ERR(cb->recv_mr)) ib_dereg_mr(cb->recv_mr); if (cb->send_mr && !IS_ERR(cb->send_mr)) ib_dereg_mr(cb->send_mr); if (cb->rdma_buf) kfree(cb->rdma_buf); if (cb->start_buf) kfree(cb->start_buf); return ret; } static void krping_free_buffers(struct krping_cb *cb) { DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); if (cb->send_mr) ib_dereg_mr(cb->send_mr); if (cb->recv_mr) ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); if (cb->fastreg_mr) ib_dereg_mr(cb->fastreg_mr); if (cb->mw) ib_dealloc_mw(cb->mw); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, rdma_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->rdma_buf); if (cb->start_buf) { dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, start_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->start_buf); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id 
*cm_id) { int ret; cb->pd = ib_alloc_pd(cm_id->device); if (IS_ERR(cb->pd)) { PRINTF(cb, "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } DEBUG_LOG(cb, "created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, cb->txdepth * 2, 0); if (IS_ERR(cb->cq)) { PRINTF(cb, "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } DEBUG_LOG(cb, "created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { PRINTF(cb, "ib_req_notify_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { PRINTF(cb, "krping_create_qp failed: %d\n", ret); goto err2; } DEBUG_LOG(cb, "created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the (possibly rebound) rkey for the rdma buffer. * FASTREG mode: invalidate and rebind via fastreg wr. * MW mode: rebind the MW. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { u32 rkey = 0xffffffff; u64 p; struct ib_send_wr *bad_wr; int i; int ret; switch (cb->mem) { case FASTREG: cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; /* * Update the fastreg key. */ ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; /* * Update the fastreg WR with new buf info. */ if (buf == (u64)cb->start_dma_addr) cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; else cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; cb->fastreg_wr.wr.fast_reg.iova_start = buf; p = (u64)(buf & PAGE_MASK); for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; i++, p += PAGE_SIZE) { cb->page_list->page_list[i] = p; DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p); } DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" " iova_start %jx page_list_len %u\n", post_inv, cb->fastreg_wr.wr.fast_reg.rkey, cb->fastreg_wr.wr.fast_reg.page_shift, - cb->fastreg_wr.wr.fast_reg.length, + (unsigned)cb->fastreg_wr.wr.fast_reg.length, (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start, cb->fastreg_wr.wr.fast_reg.page_list_len); if (post_inv) ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); else ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); cb->state = ERROR; } rkey = cb->fastreg_mr->rkey; break; case MW: /* * Update the MW with new buf info.
*/ if (buf == (u64)cb->start_dma_addr) { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.bind_info.mr = cb->start_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.mr = cb->start_mr; #endif } else { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.bind_info.mr = cb->rdma_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.mr = cb->rdma_mr; #endif } #ifdef BIND_INFO cb->bind_attr.bind_info.addr = buf; #else cb->bind_attr.addr = buf; #endif DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n", #ifdef BIND_INFO cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey); #else cb->mw->rkey, buf, cb->bind_attr.mr->rkey); #endif ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); if (ret) { PRINTF(cb, "bind mw error %d\n", ret); cb->state = ERROR; } else rkey = cb->mw->rkey; break; case MR: if (buf == (u64)cb->start_dma_addr) rkey = cb->start_mr->rkey; else rkey = cb->rdma_mr->rkey; break; case DMA: rkey = cb->dma_mr->rkey; break; default: PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); cb->state = ERROR; break; } return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* * Client side will do fastreg or mw bind before * advertising the rdma buffer. Server side * sends have no data. */ if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr, inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->remote_len; cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); /* Issue RDMA Read. */ if (cb->read_inv) cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; if (cb->mem == FASTREG) { /* * Immediately follow the read with a * fenced LOCAL_INV. 
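 * The IB_SEND_FENCE flag makes the invalidate wait for the preceding * RDMA read to complete, so the rkey is not invalidated while the read * still uses it.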
*/ cb->rdma_sq_wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; inv.send_flags = IB_SEND_FENCE; } } ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } cb->rdma_sq_wr.next = NULL; DEBUG_LOG(cb, "server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { PRINTF(cb, "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received read complete\n"); /* Display data in recv buf */ if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "server ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "server ping data: %s\n", cb->rdma_buf); } /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); /* RDMA Write echo data */ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", cb->rdma_sq_wr.sg_list->lkey, (unsigned long long)cb->rdma_sq_wr.sg_list->addr, cb->rdma_sq_wr.sg_list->length); ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server rdma write complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; scnt = 0; cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { 
wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { PRINTF(cb, "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion with error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters = cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { PRINTF(cb, "state = ERROR, bailing\n"); return; } } } if (scnt < iters) { struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion with error at %s:\n", cb->server ?
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); PRINTF(cb, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static int fastreg_supported(struct krping_cb *cb, int server) { struct ib_device *dev = server?cb->child_cm_id->device: cb->cm_id->device; struct ib_device_attr attr; int ret; ret = ib_query_device(dev, &attr); if (ret) { PRINTF(cb, "ib_query_device failed ret %d\n", ret); return 0; } if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n", (unsigned long 
long)attr.device_cap_flags); return 0; } DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n", (uintmax_t)attr.device_cap_flags); return 1; } static int krping_bind_server(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); if (ret) { PRINTF(cb, "rdma_bind_addr error %d\n", ret); return ret; } DEBUG_LOG(cb, "rdma_bind_addr successful\n"); DEBUG_LOG(cb, "rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { PRINTF(cb, "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } if (cb->mem == FASTREG && !fastreg_supported(cb, 1)) return -EINVAL; return 0; } /* * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test5(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *read, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; time_t start; int count = 0; int scnt; int depth = cb->txdepth >> 1; if (!depth) { PRINTF(cb, "txdepth must be > 1 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); read = kzalloc(sizeof *read * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for 
(i=0; i < plen; i++) { pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } sgl[scnt].lkey = mr[scnt]->rkey; sgl[scnt].length = cb->size; sgl[scnt].addr = (u64)buf[scnt]; DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n", __func__, scnt, sgl[scnt].lkey, sgl[scnt].length, (uintmax_t)sgl[scnt].addr); fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].send_flags = 0; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &read[scnt]; read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV; read[scnt].wr_id = scnt; read[scnt].send_flags = IB_SEND_SIGNALED; read[scnt].wr.rdma.rkey = cb->remote_rkey; read[scnt].wr.rdma.remote_addr = cb->remote_addr; read[scnt].num_sge = 1; read[scnt].sg_list = &sgl[scnt]; ret = ib_post_send(cb->qp, &fr[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (read) kfree(read); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); }
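/*
 * Illustrative sketch only, not part of this change: the per-slot work
 * krping_fr_test5() posts reduces to a two-WR chain, an unsignaled
 * fastreg registration followed by a signaled RDMA read-with-invalidate.
 * The helper name example_post_fr_read() is hypothetical.
 */
static int
example_post_fr_read(struct ib_qp *qp, struct ib_mr *mr,
    struct ib_fast_reg_page_list *pl, int plen, struct ib_sge *sgl,
    u32 peer_rkey, u64 peer_addr)
{
	struct ib_send_wr fr, rd, *bad;

	memset(&fr, 0, sizeof(fr));
	fr.opcode = IB_WR_FAST_REG_MR;
	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr.wr.fast_reg.length = sgl->length;
	fr.wr.fast_reg.page_list = pl;
	fr.wr.fast_reg.page_list_len = plen;
	fr.wr.fast_reg.iova_start = sgl->addr;
	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE |
	    IB_ACCESS_LOCAL_WRITE;
	fr.wr.fast_reg.rkey = mr->rkey;
	fr.next = &rd;				/* chain: fastreg, then read */

	memset(&rd, 0, sizeof(rd));
	rd.opcode = IB_WR_RDMA_READ_WITH_INV;
	rd.send_flags = IB_SEND_SIGNALED;	/* only the read is signaled */
	rd.wr.rdma.rkey = peer_rkey;
	rd.wr.rdma.remote_addr = peer_addr;
	rd.num_sge = 1;
	rd.sg_list = sgl;

	/* One verb call posts the whole chain in order. */
	return (ib_post_send(qp, &fr, &bad));
}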
static void krping_fr_test_server(struct krping_cb *cb) { DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test5(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_client(struct krping_cb *cb) { struct ib_send_wr *bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); krping_fr_test5(cb); }
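/*
 * Every *_test_server()/_client() above spins for a single send
 * completion with the same poll loop.  Factored out as a minimal sketch
 * (the helper name example_poll_one_send is hypothetical, not in the
 * driver), the idiom is:
 */
static int
example_poll_one_send(struct krping_cb *cb)
{
	struct ib_wc wc;
	int ret;

	/* Busy-poll until ib_poll_cq() returns one CQE or an error. */
	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
		;
	if (ret < 0) {
		PRINTF(cb, "poll error %d\n", ret);
		return (ret);
	}
	if (wc.status != IB_WC_SUCCESS) {
		PRINTF(cb, "send completion error %d\n", wc.status);
		return (-EIO);
	}
	return (0);
}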
/* * sq-depth worth of write + fastreg + inv, reposting them as the invs * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. * If a count is given, then the last IO will have a bogus lkey in the * write work request. This reproduces a fw bug where the connection * will get stuck if a fastreg is processed while the ulptx is failing * the bad write. */ static void krping_fr_test6(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *write, *inv, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt; int depth = cb->txdepth / 3; if (!depth) { PRINTF(cb, "txdepth must be >= 3 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); write = kzalloc(sizeof *write * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s write %p size %zu\n", __func__, write, sizeof *write * depth); inv = kzalloc(sizeof *inv * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for (i=0; i < plen; i++) { pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } write[scnt].opcode = IB_WR_RDMA_WRITE; write[scnt].wr_id = scnt; write[scnt].wr.rdma.rkey = cb->remote_rkey; write[scnt].wr.rdma.remote_addr = cb->remote_addr; write[scnt].num_sge = 1; write[scnt].sg_list = &cb->rdma_sgl; write[scnt].sg_list->length = cb->size; write[scnt].next = &fr[scnt]; fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &inv[scnt]; inv[scnt].opcode = IB_WR_LOCAL_INV;
inv[scnt].send_flags = IB_SEND_SIGNALED; inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey; ret = ib_post_send(cb->qp, &write[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == (cb->count - 1)) cb->rdma_sgl.lkey = 0x00dead; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; inv[wc.wr_id].ex.invalidate_rkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (write) kfree(write); if (inv) kfree(inv); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); } static void krping_fr_test6_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test6(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test6_client(struct krping_cb *cb) { struct ib_send_wr
*bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); krping_fr_test6(cb); } static void krping_run_server(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); else if (cb->frtest) { switch (cb->testnum) { case 1: case 2: case 3: case 4: krping_fr_test_server(cb); break; case 5: krping_fr_test5_server(cb); break; case 6: krping_fr_test6_server(cb); break; default: PRINTF(cb, "unknown fr test %d\n", cb->testnum); goto err2; break; } } else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct krping_cb *cb) { int ping, start, cc, i, ret; struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. */ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete.
*/ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { PRINTF(cb, "data mismatch!\n"); break; } if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "ping data: %s\n", cb->rdma_buf); } #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = 0; cb->rdma_sq_wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion with error at %s:\n", cb->server ?
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } /* * fastreg 2 valid different mrs and verify the completions. 
/* * fastreg 2 valid different mrs and verify the completions. */ static void krping_fr_test1(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1, *mr2; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1)); goto err1; } mr2 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr2)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr2)); goto err2; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } fr.wr.fast_reg.rkey = mr2->rkey; DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr2!\n"); ib_dereg_mr(mr2); err2: DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); }
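/*
 * Sketch of the rkey-cycling idiom the fr tests rely on (the helper
 * name example_next_rkey is hypothetical): the low 8 bits of an
 * rkey/stag are the consumer-owned key portion -- the driver itself
 * prints mr->rkey >> 8 as the "stag index" -- so bumping them with
 * ib_update_fast_reg_key() yields a fresh rkey for the same MR before
 * each re-registration.
 */
static inline u32
example_next_rkey(struct ib_mr *mr, u8 *key)
{
	ib_update_fast_reg_key(mr, ++(*key));
	return (mr->rkey);	/* use in the next IB_WR_FAST_REG_MR WR */
}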
/* * fastreg the same mr twice, 2nd one should produce error cqe. */ static void krping_fr_test2(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1)); goto err1; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); }
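/*
 * krping_fr_test2() above expects its second fastreg to fail precisely
 * because the MR is still in the valid state.  A LOCAL_INV work request
 * between the two registrations would return the MR to the free state
 * and make the second fastreg legal -- a minimal sketch (the helper
 * example_local_inv is hypothetical):
 */
static int
example_local_inv(struct ib_qp *qp, u32 rkey)
{
	struct ib_send_wr inv, *bad;

	memset(&inv, 0, sizeof(inv));
	inv.opcode = IB_WR_LOCAL_INV;
	inv.send_flags = IB_SEND_SIGNALED;
	inv.ex.invalidate_rkey = rkey;	/* stag to invalidate */
	return (ib_post_send(qp, &inv, &bad));
}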
/* * fastreg pipelined in a loop as fast as we can until the user interrupts. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test3(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr)); goto err1; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; while (1) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth >> 1)) { ib_update_fast_reg_key(mr, ++key); fr.wr.fast_reg.rkey = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; size = arc4random() % cb->size; if (size == 0) size = cb->size; plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list_len = plen; ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } scnt += 2; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u\n", wc.status); goto err2; } count++; scnt--; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "fr_test: done!\n"); ib_dereg_mr(mr); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); }
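/*
 * krping_fr_test3() above re-registers with a random length on every
 * iteration.  In isolation (example_random_fr_size is a hypothetical
 * helper), the size draw and the matching page-list recomputation are:
 */
static int
example_random_fr_size(int max_size, int *plen)
{
	int size;

	size = arc4random() % max_size;
	if (size == 0)		/* keep the length in [1, max_size] */
		size = max_size;
	*plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
	return (size);
}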
/* * fastreg 1 and invalidate 1 mr and verify completion. */ static void krping_fr_test4(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1)); goto err1; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 1); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } static void krping_fr_test(struct krping_cb *cb) { switch (cb->testnum) { case 1: krping_fr_test1(cb); break; case 2: krping_fr_test2(cb); break; case 3: krping_fr_test3(cb); break; case 4: krping_fr_test4(cb); break; case 5: krping_fr_test5_client(cb); break; case 6: krping_fr_test6_client(cb); break; default: PRINTF(cb, "Unknown frtest num %u\n", cb->testnum); break; } } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } DEBUG_LOG(cb, "rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret =
rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 2000); if (ret) { PRINTF(cb, "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { PRINTF(cb, "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } if (cb->mem == FASTREG && !fastreg_supported(cb, 0)) return -EINVAL; DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } int krping_doit(char *cmd, void *cookie) { struct krping_cb *cb; int op; int ret = 0; char *optarg; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; cb->mem = DMA; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { case 'a': cb->addr_str = optarg; DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); if (!inet_aton(optarg, &cb->addr)) { PRINTF(cb, "bad addr string %s\n", optarg); ret = EINVAL; } break; case 'p': cb->port = htons(optint); DEBUG_LOG(cb, "port %d\n", (int)optint); break; case 'P': cb->poll = 1; DEBUG_LOG(cb, "poll\n"); break; case 's': cb->server = 1; DEBUG_LOG(cb, "server\n"); break; case 'c': cb->server = 0; DEBUG_LOG(cb, "client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { PRINTF(cb, "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else DEBUG_LOG(cb, "size %d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { PRINTF(cb, "Invalid count %d\n", cb->count); ret = EINVAL; } else DEBUG_LOG(cb, "count %d\n", (int) cb->count); break; case 'v': cb->verbose++; DEBUG_LOG(cb, "verbose\n"); break; case 'V': cb->validate++; DEBUG_LOG(cb, "validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; case 'm': if (!strncmp(optarg, "dma", 3)) cb->mem = DMA; else if (!strncmp(optarg, "fastreg", 7)) cb->mem = FASTREG; else if (!strncmp(optarg, "mw", 2)) cb->mem = MW; else if (!strncmp(optarg, "mr", 2)) cb->mem = MR; else { PRINTF(cb, "unknown mem mode %s. 
" "Must be dma, fastreg, mw, or mr\n", optarg); ret = -EINVAL; break; } break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; DEBUG_LOG(cb, "using local dma lkey\n"); break; case 'R': cb->read_inv = 1; DEBUG_LOG(cb, "using read-with-inv\n"); break; case 'f': cb->frtest = 1; cb->testnum = optint; DEBUG_LOG(cb, "fast-reg test!\n"); break; default: PRINTF(cb, "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { PRINTF(cb, "must be either client or server\n"); ret = -EINVAL; goto out; } if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } if (cb->server_invalidate && cb->mem != FASTREG) { PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->read_inv && cb->mem != FASTREG) { PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) { PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); ret = -EINVAL; goto out; } cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); PRINTF(cb, "rdma_create_id error %d\n", ret); goto out; } DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? &cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } void krping_init(void) { mutex_init(&krping_mutex); } Index: user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/core/addr.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/core/addr.c (revision 298468) @@ -1,656 +1,658 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("IB Address Translation"); MODULE_LICENSE("Dual BSD/GPL"); struct addr_req { struct list_head list; struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); unsigned long timeout; int status; }; static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); static LIST_HEAD(req_list); static struct delayed_work work; static struct workqueue_struct *addr_wq; static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); init_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_register_client); static inline void put_client(struct rdma_addr_client *client) { if (atomic_dec_and_test(&client->refcount)) complete(&client->comp); } void rdma_addr_unregister_client(struct rdma_addr_client *client) { put_client(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_unregister_client); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr) { if (dev->if_type == IFT_INFINIBAND) dev_addr->dev_type = ARPHRD_INFINIBAND; else if (dev->if_type == IFT_ETHER) dev_addr->dev_type = ARPHRD_ETHER; else dev_addr->dev_type = 0; memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen); memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr), dev->if_addrlen); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen); dev_addr->bound_dev_if = dev->if_index; return 0; } EXPORT_SYMBOL(rdma_copy_addr); #define SCOPE_ID_CACHE(_scope_id, _addr6) do { \ (_addr6)->sin6_addr.s6_addr[3] = (_scope_id); \ (_addr6)->sin6_scope_id = 0; } while (0) #define SCOPE_ID_RESTORE(_scope_id, _addr6) do { \ (_addr6)->sin6_scope_id = (_scope_id); \ (_addr6)->sin6_addr.s6_addr[3] = 0; } while (0) int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, u16 *vlan_id) { struct net_device *dev; int ret = -EADDRNOTAVAIL; if (dev_addr->bound_dev_if) { dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; ret = rdma_copy_addr(dev_addr, dev, NULL); dev_put(dev); return ret; } switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(&init_net, ((struct sockaddr_in *) addr)->sin_addr.s_addr); if (!dev) return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; #if defined(INET6) case AF_INET6: { struct sockaddr_in6 *sin6; struct ifaddr *ifa; in_port_t port; uint32_t scope_id; sin6 = (struct sockaddr_in6 *)addr; port = sin6->sin6_port; sin6->sin6_port = 0; scope_id = sin6->sin6_scope_id; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) SCOPE_ID_CACHE(scope_id, sin6); ifa = ifa_ifwithaddr(addr); sin6->sin6_port = port; if 
(IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) SCOPE_ID_RESTORE(scope_id, sin6); if (ifa == NULL) { ret = -ENODEV; break; } ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp); ifa_free(ifa); break; } #endif default: break; } return ret; } EXPORT_SYMBOL(rdma_translate_ip); static void set_timeout(unsigned long time) { unsigned long delay; delay = time - jiffies; if ((long)delay <= 0) delay = 1; mod_delayed_work(addr_wq, &work, delay); } static void queue_req(struct addr_req *req) { struct addr_req *temp_req; mutex_lock(&lock); list_for_each_entry_reverse(temp_req, &req_list, list) { if (time_after_eq(req->timeout, temp_req->timeout)) break; } list_add(&req->list, &temp_req->list); if (req_list.next == &req->list) set_timeout(req->timeout); mutex_unlock(&lock); } static int addr_resolve(struct sockaddr *src_in, struct sockaddr *dst_in, struct rdma_dev_addr *addr) { struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct ifaddr *ifa; struct ifnet *ifp; struct rtentry *rte; #if defined(INET) || defined(INET6) in_port_t port; #endif #ifdef INET6 uint32_t scope_id; #endif u_char edst[MAX_ADDR_LEN]; int multi; int bcast; int is_gw = 0; int error = 0; /* * Determine whether the address is unicast, multicast, or broadcast * and whether the source interface is valid. */ multi = 0; bcast = 0; sin = NULL; sin6 = NULL; ifp = NULL; rte = NULL; ifa = NULL; ifp = NULL; memset(edst, 0, sizeof(edst)); #ifdef INET6 scope_id = -1U; #endif switch (dst_in->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)dst_in; if (sin->sin_addr.s_addr == INADDR_BROADCAST) bcast = 1; if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) multi = 1; sin = (struct sockaddr_in *)src_in; if (sin->sin_addr.s_addr != INADDR_ANY) { /* * Address comparison fails if the port is set * cache it here to be restored later. */ port = sin->sin_port; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); /* * If we have a source address to use look it * up first and verify that it is a local * interface: */ ifa = ifa_ifwithaddr(src_in); sin->sin_port = port; if (ifa == NULL) { error = ENETUNREACH; goto done; } ifp = ifa->ifa_ifp; ifa_free(ifa); if (bcast || multi) goto mcast; } break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)dst_in; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) multi = 1; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { /* * The IB address comparison fails if the * scope ID is set and not part of the addr: */ scope_id = sin6->sin6_scope_id; if (scope_id < 256) SCOPE_ID_CACHE(scope_id, sin6); } sin6 = (struct sockaddr_in6 *)src_in; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { port = sin6->sin6_port; sin6->sin6_port = 0; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { if (scope_id < 256) SCOPE_ID_CACHE(scope_id, sin6); } /* * If we have a source address to use look it * up first and verify that it is a local * interface: */ ifa = ifa_ifwithaddr(src_in); sin6->sin6_port = port; if (ifa == NULL) { error = ENETUNREACH; goto done; } ifp = ifa->ifa_ifp; ifa_free(ifa); if (bcast || multi) goto mcast; } break; #endif default: error = EINVAL; goto done; } /* * Make sure the route exists and has a valid link. 
 */
	rte = rtalloc1(dst_in, 1, 0);
	if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) {
		if (rte)
			RTFREE_LOCKED(rte);
		error = EHOSTUNREACH;
		goto done;
	}
	if (rte->rt_flags & RTF_GATEWAY)
		is_gw = 1;

	/*
	 * If it's not multicast or broadcast and the route doesn't match the
	 * requested interface return unreachable.  Otherwise fetch the
	 * correct interface pointer and unlock the route.
	 */
	if (multi || bcast) {
		if (ifp == NULL) {
			ifp = rte->rt_ifp;
			/* rt_ifa holds the route answer source address */
			ifa = rte->rt_ifa;
		}
		RTFREE_LOCKED(rte);
	} else if (ifp && ifp != rte->rt_ifp) {
		RTFREE_LOCKED(rte);
		error = ENETUNREACH;
		goto done;
	} else {
		if (ifp == NULL) {
			ifp = rte->rt_ifp;
			ifa = rte->rt_ifa;
		}
		RT_UNLOCK(rte);
	}
#if defined(INET) || defined(INET6)
mcast:
#endif
	if (bcast) {
		memcpy(edst, ifp->if_broadcastaddr, ifp->if_addrlen);
		goto done;
	} else if (multi) {
		struct sockaddr *llsa;
		struct sockaddr_dl sdl;

		sdl.sdl_len = sizeof(sdl);
		llsa = (struct sockaddr *)&sdl;

		if (ifp->if_resolvemulti == NULL) {
			error = EOPNOTSUPP;
			goto done;
		}
		error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
		if (error == 0) {
			memcpy(edst, LLADDR((struct sockaddr_dl *)llsa),
			    ifp->if_addrlen);
		}
		goto done;
	}

	/*
	 * Resolve the link local address.
	 */
	switch (dst_in->sa_family) {
#ifdef INET
	case AF_INET:
		error = arpresolve(ifp, is_gw, NULL,
		    is_gw ? rte->rt_gateway : dst_in, edst, NULL);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		error = nd6_resolve(ifp, is_gw, NULL,
		    is_gw ? rte->rt_gateway : dst_in, edst, NULL);
		break;
#endif
	default:
+		KASSERT(0, ("rdma_addr_resolve: Unreachable"));
+		error = EINVAL;
		break;
	}
	RTFREE(rte);
done:
	if (error == 0)
		error = -rdma_copy_addr(addr, ifp, edst);
	if (error == 0)
		memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr));
#ifdef INET6
	if (scope_id < 256) {
		sin6 = (struct sockaddr_in6 *)src_in;
		SCOPE_ID_RESTORE(scope_id, sin6);
		sin6 = (struct sockaddr_in6 *)dst_in;
		SCOPE_ID_RESTORE(scope_id, sin6);
	}
#endif
	if (error == EWOULDBLOCK)
		error = ENODATA;
	return -error;
}

static void process_req(struct work_struct *work)
{
	struct addr_req *req, *temp_req;
	struct sockaddr *src_in, *dst_in;
	struct list_head done_list;

	INIT_LIST_HEAD(&done_list);

	mutex_lock(&lock);
	list_for_each_entry_safe(req, temp_req, &req_list, list) {
		if (req->status == -ENODATA) {
			src_in = (struct sockaddr *) &req->src_addr;
			dst_in = (struct sockaddr *) &req->dst_addr;
			req->status = addr_resolve(src_in, dst_in, req->addr);
			if (req->status && time_after_eq(jiffies, req->timeout))
				req->status = -ETIMEDOUT;
			else if (req->status == -ENODATA)
				continue;
		}
		list_move_tail(&req->list, &done_list);
	}

	if (!list_empty(&req_list)) {
		req = list_entry(req_list.next, struct addr_req, list);
		set_timeout(req->timeout);
	}
	mutex_unlock(&lock);

	list_for_each_entry_safe(req, temp_req, &done_list, list) {
		list_del(&req->list);
		req->callback(req->status, (struct sockaddr *) &req->src_addr,
			req->addr, req->context);
		put_client(req->client);
		kfree(req);
	}
}

int rdma_resolve_ip(struct rdma_addr_client *client,
		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
		    struct rdma_dev_addr *addr, int timeout_ms,
		    void (*callback)(int status, struct sockaddr *src_addr,
				     struct rdma_dev_addr *addr, void *context),
		    void *context)
{
	struct sockaddr *src_in, *dst_in;
	struct addr_req *req;
	int ret = 0;

	req = kzalloc(sizeof *req, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	src_in = (struct sockaddr *) &req->src_addr;
	dst_in = (struct sockaddr *) &req->dst_addr;

	if (src_addr) {
		if (src_addr->sa_family != dst_addr->sa_family) {
			ret = -EINVAL;
			goto err;
		}
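The SCOPE_ID_RESTORE() calls in the done: path above undo a trick set up earlier in addr_resolve(): for link-local destinations the scope id breaks address comparison, so the code stashes it in s6_addr[3] (unused for a link-local address when the id fits in a byte) and clears sin6_scope_id, then restores both fields once lookup is finished. A minimal user-space sketch of that round trip, assuming scope_id < 256 as the driver checks before caching:

	/* Sketch only -- mirrors the SCOPE_ID_CACHE/SCOPE_ID_RESTORE macros. */
	#include <stdint.h>
	#include <assert.h>
	#include <netinet/in.h>

	static void
	scope_id_round_trip(struct sockaddr_in6 *sin6)
	{
		uint32_t scope_id = sin6->sin6_scope_id;

		assert(scope_id < 256);
		/* cache: fold the scope id into the address bytes */
		sin6->sin6_addr.s6_addr[3] = scope_id;
		sin6->sin6_scope_id = 0;

		/* ... ifa_ifwithaddr()/address comparison happens here ... */

		/* restore: put the scope id back, clear the address byte */
		sin6->sin6_scope_id = scope_id;
		sin6->sin6_addr.s6_addr[3] = 0;
	}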
memcpy(src_in, src_addr, ip_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->client = client; atomic_inc(&client->refcount); req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: req->timeout = jiffies; queue_req(req); break; case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); break; default: ret = req->status; atomic_dec(&client->refcount); goto err; } return ret; err: kfree(req); return ret; } EXPORT_SYMBOL(rdma_resolve_ip); void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); set_timeout(req->timeout); break; } } mutex_unlock(&lock); } EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct rdma_dev_addr *addr; struct completion comp; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct rdma_dev_addr)); complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, u16 *vlan_id) { int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; struct net_device *dev; union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid); if (ret) return ret; ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid); if (ret) return ret; memset(&dev_addr, 0, sizeof(dev_addr)); ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; wait_for_completion(&ctx.comp); memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) return -ENODEV; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); return ret; } EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { int ret = 0; struct rdma_dev_addr dev_addr; union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } gid_addr; ret = rdma_gid2ip(&gid_addr._sockaddr, sgid); if (ret) return ret; memset(&dev_addr, 0, sizeof(dev_addr)); ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); if (ret) return ret; memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); return ret; } EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { if (event == NETEVENT_NEIGH_UPDATE) { set_timeout(jiffies); } return 0; } static struct notifier_block nb = { .notifier_call = netevent_callback }; static int __init addr_init(void) { INIT_DELAYED_WORK(&work, process_req); addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) return -ENOMEM; register_netevent_notifier(&nb); rdma_addr_register_client(&self); return 0; } static void __exit addr_cleanup(void) { rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } module_init(addr_init); module_exit(addr_cleanup); Index: 
user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c (revision 298468) @@ -1,1452 +1,1457 @@ /* * Copyright (c) 2006 Mellanox Technologies. All rights reserved * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #ifdef CONFIG_INFINIBAND_IPOIB_CM #include #include #include #include #include #include int ipoib_max_conn_qp = 128; module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); MODULE_PARM_DESC(max_nonsrq_conn_qp, "Max number of connected-mode QPs per interface " "(applied only if shared receive queue is not available)"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; module_param_named(cm_data_debug_level, data_debug_level, int, 0644); MODULE_PARM_DESC(cm_data_debug_level, "Enable data path debug tracing for connected mode if > 0"); #endif #define IPOIB_CM_IETF_ID 0x1000000000000000ULL #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) #define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) #define IPOIB_CM_RX_DELAY (3 * 256 * HZ) #define IPOIB_CM_RX_UPDATE_MASK (0x3) static struct ib_qp_attr ipoib_cm_err_attr = { .qp_state = IB_QPS_ERR }; #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff static struct ib_send_wr ipoib_cm_rx_drain_wr = { .wr_id = IPOIB_CM_RX_DRAIN_WRID, .opcode = IB_WR_SEND, }; static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req); } static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id) { struct ib_recv_wr *bad_wr; struct ipoib_rx_buf *rx_req; struct mbuf *m; int ret; int i; rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { priv->cm.rx_sge[i].addr = rx_req->mapping[i]; priv->cm.rx_sge[i].length = m->m_len; } priv->cm.rx_wr.num_sge = i; priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post srq failed for buf %d 
(%d)\n", id, ret); ipoib_dma_unmap_rx(priv, rx_req); m_freem(priv->cm.srq_ring[id].mb); priv->cm.srq_ring[id].mb = NULL; } return ret; } static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *rx, struct ib_recv_wr *wr, struct ib_sge *sge, int id) { struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; struct mbuf *m; int ret; int i; rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { sge[i].addr = rx_req->mapping[i]; sge[i].length = m->m_len; } wr->num_sge = i; wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; ret = ib_post_recv(rx->qp, wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, rx_req); m_freem(rx->rx_ring[id].mb); rx->rx_ring[id].mb = NULL; } return ret; } static struct mbuf * ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req, priv->cm.max_cm_mtu); } static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_ring) { int i; for (i = 0; i < ipoib_recvq_size; ++i) if (rx_ring[i].mb) { ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]); m_freem(rx_ring[i].mb); } kfree(rx_ring); } static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) { struct ib_send_wr *bad_wr; struct ipoib_cm_rx *p; /* We only reserved 1 extra slot in CQ for drain WRs, so * make sure we have at most 1 outstanding WR. */ if (list_empty(&priv->cm.rx_flush_list) || !list_empty(&priv->cm.rx_drain_list)) return; /* * QPs on flush list are error state. This way, a "flush * error" WC will be immediately generated for each WR we post. */ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) ipoib_warn(priv, "failed to post drain wr\n"); list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); } static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) { struct ipoib_cm_rx *p = ctx; struct ipoib_dev_priv *priv = p->priv; unsigned long flags; if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) return; spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_flush_list); p->state = IPOIB_CM_RX_FLUSH; ipoib_cm_start_rx_drain(priv); spin_unlock_irqrestore(&priv->lock, flags); } static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *p) { struct ib_qp_init_attr attr = { .event_handler = ipoib_cm_rx_event_handler, .send_cq = priv->recv_cq, /* For drain WR */ .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = 1, /* For drain WR */ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = p, }; if (!ipoib_cm_has_srq(priv)) { attr.cap.max_recv_wr = ipoib_recvq_size; attr.cap.max_recv_sge = priv->cm.num_frags; } return ib_create_qp(priv->pd, &attr); } static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, unsigned psn) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); return ret; } ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); return ret; } qp_attr.qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP 
attr for RTR: %d\n", ret); return ret; } qp_attr.rq_psn = psn; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); return ret; } /* * Current Mellanox HCA firmware won't generate completions * with error for drain WRs unless the QP has been moved to * RTS first. This work-around leaves a window where a QP has * moved to error asynchronously, but this will eventually get * fixed in firmware, so let's not error out if modify QP * fails. */ qp_attr.qp_state = IB_QPS_RTS; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); return 0; } ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); return 0; } return 0; } static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv, struct ib_recv_wr *wr, struct ib_sge *sge) { int i; for (i = 0; i < IPOIB_CM_RX_SG; i++) sge[i].lkey = priv->mr->lkey; wr->next = NULL; wr->sg_list = sge; wr->num_sge = 1; } static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx) { struct { struct ib_recv_wr wr; struct ib_sge sge[IPOIB_CM_RX_SG]; } *t; int ret; int i; rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL); if (!rx->rx_ring) { printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); return -ENOMEM; } memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring); t = kmalloc(sizeof *t, GFP_KERNEL); if (!t) { ret = -ENOMEM; goto err_free; } ipoib_cm_init_rx_wr(priv, &t->wr, t->sge); spin_lock_irq(&priv->lock); if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { spin_unlock_irq(&priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); ret = -EINVAL; goto err_free; } else ++priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; } ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i); if (ret) { ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " "failed for buf %d\n", i); ret = -EIO; goto err_count; } } rx->recv_count = ipoib_recvq_size; kfree(t); return 0; err_count: spin_lock_irq(&priv->lock); --priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); err_free: kfree(t); ipoib_cm_free_rx_ring(priv, rx->rx_ring); return ret; } static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, struct ib_cm_req_event_param *req, unsigned psn) { struct ipoib_cm_data data = {}; struct ib_cm_rep_param rep = {}; data.qpn = cpu_to_be32(priv->qp->qp_num); data.mtu = cpu_to_be32(priv->cm.max_cm_mtu); rep.private_data = &data; rep.private_data_len = sizeof data; rep.flow_control = 0; rep.rnr_retry_count = req->rnr_retry_count; rep.srq = ipoib_cm_has_srq(priv); rep.qp_num = qp->qp_num; rep.starting_psn = psn; return ib_send_cm_rep(cm_id, &rep); } static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_dev_priv *priv = cm_id->context; struct ipoib_cm_rx *p; unsigned psn; int ret; ipoib_dbg(priv, "REQ arrived\n"); p = kzalloc(sizeof *p, GFP_KERNEL); if (!p) return -ENOMEM; p->priv = priv; p->id = cm_id; cm_id->context = p; p->state = IPOIB_CM_RX_LIVE; p->jiffies = jiffies; INIT_LIST_HEAD(&p->list); p->qp = ipoib_cm_create_rx_qp(priv, p); if (IS_ERR(p->qp)) { ret = 
PTR_ERR(p->qp); goto err_qp; } psn = random() & 0xffffff; ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn); if (ret) goto err_modify; if (!ipoib_cm_has_srq(priv)) { ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p); if (ret) goto err_modify; } spin_lock_irq(&priv->lock); queue_delayed_work(ipoib_workqueue, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); /* Add this entry to passive ids list head, but do not re-add it * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ p->jiffies = jiffies; if (p->state == IPOIB_CM_RX_LIVE) list_move(&p->list, &priv->cm.passive_ids); spin_unlock_irq(&priv->lock); ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn); if (ret) { ipoib_warn(priv, "failed to send REP: %d\n", ret); if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) ipoib_warn(priv, "unable to move qp to error state\n"); } return 0; err_modify: ib_destroy_qp(p->qp); err_qp: kfree(p); return ret; } static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_rx *p; struct ipoib_dev_priv *priv; switch (event->event) { case IB_CM_REQ_RECEIVED: return ipoib_cm_req_handler(cm_id, event); case IB_CM_DREQ_RECEIVED: p = cm_id->context; ib_send_cm_drep(cm_id, NULL, 0); /* Fall through */ case IB_CM_REJ_RECEIVED: p = cm_id->context; priv = p->priv; if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) ipoib_warn(priv, "unable to move qp to error state\n"); /* Fall through */ default: return 0; } } void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_cm_rx_buf saverx; struct ipoib_cm_rx_buf *rx_ring; unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); struct ifnet *dev = priv->dev; struct mbuf *mb, *newmb; struct ipoib_cm_rx *p; int has_srq; u_short proto; + CURVNET_SET_QUIET(dev->if_vnet); + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { spin_lock(&priv->lock); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); if (priv->cm.id != NULL) queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); spin_unlock(&priv->lock); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", wr_id, ipoib_recvq_size); - return; + goto done; } p = wc->qp->qp_context; has_srq = ipoib_cm_has_srq(priv); rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; mb = rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { ipoib_dbg(priv, "cm recv error " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); if_inc_counter(dev, IFCOUNTER_IERRORS, 1); if (has_srq) goto repost; else { if (!--p->recv_count) { spin_lock(&priv->lock); list_move(&p->list, &priv->cm.rx_reap_list); queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); spin_unlock(&priv->lock); } - return; + goto done; } } if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { p->jiffies = jiffies; /* Move this entry to list head, but do not re-add it * if it has been moved out of list. */ if (p->state == IPOIB_CM_RX_LIVE) list_move(&p->list, &priv->cm.passive_ids); } } memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx)); newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]); if (unlikely(!newmb)) { /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. 
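The failure path described in this comment depends on the saverx snapshot taken just before ipoib_cm_alloc_rx_mb(): on allocation failure the old descriptor (mbuf pointer plus DMA mapping) is copied back so the slot can simply be reposted and the receive ring never loses a buffer. A reduced sketch of the pattern; rx_buf, alloc_rx_buf, unmap_rx_buf, deliver and repost are hypothetical names, only the snapshot-restore shape matters:

	struct rx_buf { void *mb; unsigned long long mapping[2]; };
	int  alloc_rx_buf(struct rx_buf *);		/* hypothetical */
	void unmap_rx_buf(struct rx_buf *);		/* hypothetical */
	void deliver(void *mb);				/* hypothetical */
	void repost(struct rx_buf *, unsigned);		/* hypothetical */

	static void
	refill_or_reuse(struct rx_buf *ring, unsigned idx)
	{
		struct rx_buf save = ring[idx];		/* snapshot */

		if (!alloc_rx_buf(&ring[idx])) {
			/* drop this packet, put the old buffer back */
			ring[idx] = save;
			repost(ring, idx);
			return;
		}
		/* the saved descriptor now owns the received data */
		unmap_rx_buf(&save);
		deliver(save.mb);
		repost(ring, idx);
	}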
 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
		memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, &saverx);

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_mb(priv, mb, wc->byte_len);

	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);

	mb->m_pkthdr.rcvif = dev;
	proto = *mtod(mb, uint16_t *);
	m_adj(mb, IPOIB_ENCAP_LEN);

	IPOIB_MTAP_PROTO(dev, mb, proto);
	ipoib_demux(dev, mb, ntohs(proto));

repost:
	if (has_srq) {
		if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
				   "for buf %d\n", wr_id);
	} else {
		if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
							  &priv->cm.rx_wr,
							  priv->cm.rx_sge,
							  wr_id))) {
			--p->recv_count;
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
				   "for buf %d\n", wr_id);
		}
	}
+done:
+	CURVNET_RESTORE();
+	return;
}

static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    struct ipoib_cm_tx_buf *tx_req,
			    unsigned int wr_id)
{
	struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr = mapping[i];
		priv->tx_sge[i].length = m->m_len;
	}
	priv->tx_wr.num_sge = i;
	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
	priv->tx_wr.opcode = IB_WR_SEND;

	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
}

void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
    struct ipoib_cm_tx *tx)
{
	struct ipoib_cm_tx_buf *tx_req;
	struct ifnet *dev = priv->dev;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
		while (ipoib_poll_tx(priv)); /* nothing */

	m_adj(mb, sizeof(struct ipoib_pseudoheader));
	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   mb->m_pkthdr.len, tx->mtu);
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
		return;
	}

	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
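The done: label and CURVNET_RESTORE() added in the hunk above, together with the CURVNET_SET_QUIET() at the top of ipoib_cm_handle_rx_wc() and the return-to-goto conversions, are the substance of r298468's change to this file: on a VIMAGE kernel the receive handler must establish the current vnet from the receiving interface and undo it on every exit path. The shape of the pattern as a sketch; unexpected_wrid(), completion_has_error() and process_packet() are hypothetical stand-ins for the real checks in the handler:

	static void
	rx_completion_handler(struct ifnet *ifp)
	{
		CURVNET_SET_QUIET(ifp->if_vnet);

		if (unexpected_wrid())		/* was a bare "return" */
			goto done;
		if (completion_has_error())	/* ditto */
			goto done;
		process_packet();
	done:
		CURVNET_RESTORE();		/* runs on every path */
	}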
*/ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; tx_req->mb = mb; if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req, priv->cm.num_frags))) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); if (tx_req->mb) m_freem(tx_req->mb); return; } if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) { ipoib_warn(priv, "post_send failed\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); m_freem(mb); } else { ++tx->tx_head; if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); dev->if_drv_flags |= IFF_DRV_OACTIVE; } } } void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_cm_tx *tx = wc->qp->qp_context; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; struct ifnet *dev = priv->dev; struct ipoib_cm_tx_buf *tx_req; ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_sendq_size)) { ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", wr_id, ipoib_sendq_size); return; } tx_req = &tx->tx_ring[wr_id]; ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); /* FIXME: is this right? Shouldn't we only increment on success? */ if_inc_counter(dev, IFCOUNTER_OPACKETS, 1); m_freem(tx_req->mb); ++tx->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) { struct ipoib_path *path; ipoib_dbg(priv, "failed cm send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); path = tx->path; if (path) { path->cm = NULL; rb_erase(&path->rb_node, &priv->path_tree); list_del(&path->list); } if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); } } int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { int ret; if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev))) return 0; priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv); if (IS_ERR(priv->cm.id)) { printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); ret = PTR_ERR(priv->cm.id); goto err_cm; } ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0, NULL); if (ret) { printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, IPOIB_CM_IETF_ID | priv->qp->qp_num); goto err_listen; } return 0; err_listen: ib_destroy_cm_id(priv->cm.id); err_cm: priv->cm.id = NULL; return ret; } static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv) { struct ipoib_cm_rx *rx, *n; LIST_HEAD(list); spin_lock_irq(&priv->lock); list_splice_init(&priv->cm.rx_reap_list, &list); spin_unlock_irq(&priv->lock); list_for_each_entry_safe(rx, n, &list, list) { ib_destroy_cm_id(rx->id); ib_destroy_qp(rx->qp); if (!ipoib_cm_has_srq(priv)) { ipoib_cm_free_rx_ring(priv, rx->rx_ring); spin_lock_irq(&priv->lock); --priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); } kfree(rx); } } void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { struct ipoib_cm_rx *p; unsigned long begin; int ret; if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || 
!priv->cm.id) return; ib_destroy_cm_id(priv->cm.id); priv->cm.id = NULL; cancel_work_sync(&priv->cm.rx_reap_task); spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); list_move(&p->list, &priv->cm.rx_error_list); p->state = IPOIB_CM_RX_ERROR; spin_unlock_irq(&priv->lock); ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); if (ret) ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); spin_lock_irq(&priv->lock); } /* Wait for all RX to be drained */ begin = jiffies; while (!list_empty(&priv->cm.rx_error_list) || !list_empty(&priv->cm.rx_flush_list) || !list_empty(&priv->cm.rx_drain_list)) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "RX drain timing out\n"); /* * assume the HW is wedged and just free up everything. */ list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_reap_list); list_splice_init(&priv->cm.rx_error_list, &priv->cm.rx_reap_list); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); break; } spin_unlock_irq(&priv->lock); msleep(1); ipoib_drain_cq(priv); spin_lock_irq(&priv->lock); } spin_unlock_irq(&priv->lock); ipoib_cm_free_rx_reap_list(priv); cancel_delayed_work(&priv->cm.stale_task); } static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *p = cm_id->context; struct ipoib_dev_priv *priv = p->priv; struct ipoib_cm_data *data = event->private_data; struct ifqueue mbqueue; struct ib_qp_attr qp_attr; int qp_attr_mask, ret; struct mbuf *mb; ipoib_dbg(priv, "cm rep handler\n"); p->mtu = be32_to_cpu(data->mtu); if (p->mtu <= IPOIB_ENCAP_LEN) { ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", p->mtu, IPOIB_ENCAP_LEN); return -EINVAL; } qp_attr.qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); return ret; } qp_attr.rq_psn = 0 /* FIXME */; ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); return ret; } qp_attr.qp_state = IB_QPS_RTS; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); return ret; } ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); return ret; } bzero(&mbqueue, sizeof(mbqueue)); spin_lock_irq(&priv->lock); set_bit(IPOIB_FLAG_OPER_UP, &p->flags); if (p->path) for (;;) { _IF_DEQUEUE(&p->path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } spin_unlock_irq(&priv->lock); for (;;) { struct ifnet *dev = p->priv->dev; _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } ret = ib_send_cm_rtu(cm_id, NULL, 0); if (ret) { ipoib_warn(priv, "failed to send RTU: %d\n", ret); return ret; } return 0; } static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv, struct ipoib_cm_tx *tx) { struct ib_qp_init_attr attr = { .send_cq = priv->send_cq, .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = ipoib_sendq_size, .cap.max_send_sge = priv->cm.num_frags, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = tx }; return ib_create_qp(priv->pd, &attr); } static int ipoib_cm_send_req(struct ipoib_dev_priv *priv, struct ib_cm_id *id, struct ib_qp *qp, u32 qpn, struct ib_sa_path_rec *pathrec) { struct 
ipoib_cm_data data = {}; struct ib_cm_req_param req = {}; ipoib_dbg(priv, "cm send req\n"); data.qpn = cpu_to_be32(priv->qp->qp_num); data.mtu = cpu_to_be32(priv->cm.max_cm_mtu); req.primary_path = pathrec; req.alternate_path = NULL; req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); req.qp_num = qp->qp_num; req.qp_type = qp->qp_type; req.private_data = &data; req.private_data_len = sizeof data; req.flow_control = 0; req.starting_psn = 0; /* FIXME */ /* * Pick some arbitrary defaults here; we could make these * module parameters if anyone cared about setting them. */ req.responder_resources = 4; req.remote_cm_response_timeout = 20; req.local_cm_response_timeout = 20; req.retry_count = 0; /* RFC draft warns against retries */ req.rnr_retry_count = 0; /* RFC draft warns against retries */ req.max_cm_retries = 15; req.srq = ipoib_cm_has_srq(priv); return ib_send_cm_req(id, &req); } static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); if (ret) { ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); return ret; } qp_attr.qp_state = IB_QPS_INIT; qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; qp_attr.port_num = priv->port; qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); return ret; } return 0; } static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ib_sa_path_rec *pathrec) { struct ipoib_dev_priv *priv = p->priv; int ret; p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->priv, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); goto err_qp; } p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); if (IS_ERR(p->id)) { ret = PTR_ERR(p->id); ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); goto err_id; } ret = ipoib_cm_modify_tx_init(p->priv, p->id, p->qp); if (ret) { ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); goto err_modify; } ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec); if (ret) { ipoib_warn(priv, "failed to send cm req: %d\n", ret); goto err_send_cm; } ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", p->qp->qp_num, pathrec->dgid.raw, qpn); return 0; err_send_cm: err_modify: ib_destroy_cm_id(p->id); err_id: p->id = NULL; ib_destroy_qp(p->qp); err_qp: p->qp = NULL; kfree(p->tx_ring); err_tx: return ret; } static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { struct ipoib_dev_priv *priv = p->priv; struct ifnet *dev = priv->dev; struct ipoib_cm_tx_buf *tx_req; unsigned long begin; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", p->qp ? 
p->qp->qp_num : 0, p->tx_head, p->tx_tail); if (p->path) ipoib_path_free(priv, p->path); if (p->id) ib_destroy_cm_id(p->id); if (p->tx_ring) { /* Wait for all sends to complete */ begin = jiffies; while ((int) p->tx_tail - (int) p->tx_head < 0) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "timing out; %d sends not completed\n", p->tx_head - p->tx_tail); goto timeout; } msleep(1); } } timeout: while ((int) p->tx_tail - (int) p->tx_head < 0) { tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); m_freem(tx_req->mb); ++p->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } if (p->qp) ib_destroy_qp(p->qp); kfree(p->tx_ring); kfree(p); } static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *tx = cm_id->context; struct ipoib_dev_priv *priv = tx->priv; struct ipoib_path *path; unsigned long flags; int ret; switch (event->event) { case IB_CM_DREQ_RECEIVED: ipoib_dbg(priv, "DREQ received.\n"); ib_send_cm_drep(cm_id, NULL, 0); break; case IB_CM_REP_RECEIVED: ipoib_dbg(priv, "REP received.\n"); ret = ipoib_cm_rep_handler(cm_id, event); if (ret) ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); break; case IB_CM_REQ_ERROR: case IB_CM_REJ_RECEIVED: case IB_CM_TIMEWAIT_EXIT: ipoib_dbg(priv, "CM error %d.\n", event->event); spin_lock_irqsave(&priv->lock, flags); path = tx->path; if (path) { path->cm = NULL; tx->path = NULL; rb_erase(&path->rb_node, &priv->path_tree); list_del(&path->list); } if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); } spin_unlock_irqrestore(&priv->lock, flags); if (path) ipoib_path_free(tx->priv, path); break; default: break; } return 0; } struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ipoib_cm_tx *tx; tx = kzalloc(sizeof *tx, GFP_ATOMIC); if (!tx) return NULL; ipoib_dbg(priv, "Creating cm tx\n"); path->cm = tx; tx->path = path; tx->priv = priv; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); queue_work(ipoib_workqueue, &priv->cm.start_task); return tx; } void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = tx->priv; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock(&priv->lock); list_move(&tx->list, &priv->cm.reap_list); spin_unlock(&priv->lock); queue_work(ipoib_workqueue, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->path->pathrec.dgid.raw); tx->path = NULL; } } static void ipoib_cm_tx_start(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.start_task); struct ipoib_path *path; struct ipoib_cm_tx *p; unsigned long flags; int ret; struct ib_sa_path_rec pathrec; u32 qpn; ipoib_dbg(priv, "cm start task\n"); spin_lock_irqsave(&priv->lock, flags); while (!list_empty(&priv->cm.start_list)) { p = list_entry(priv->cm.start_list.next, typeof(*p), list); list_del_init(&p->list); path = p->path; qpn = IPOIB_QPN(path->hwaddr); memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); spin_unlock_irqrestore(&priv->lock, flags); ret = ipoib_cm_tx_init(p, qpn, &pathrec); spin_lock_irqsave(&priv->lock, flags); if (ret) { path = p->path; if (path) { path->cm = NULL; 
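This error leg and ipoib_cm_tx_reap() just below show the teardown discipline used throughout the file: CM callbacks and completion handlers never destroy a connection directly; they move it onto cm.reap_list and queue reap_task, and the work function drains the list with the spinlock dropped around each destroy, since destruction sleeps. A reduced sketch with hypothetical conn/destroy_one/lock/reap_list names:

	static void
	reap_work(struct work_struct *work)
	{
		struct conn *c;

		spin_lock_irq(&lock);
		while (!list_empty(&reap_list)) {
			c = list_entry(reap_list.next, struct conn, list);
			list_del(&c->list);
			spin_unlock_irq(&lock);	/* destroy_one() may sleep */
			destroy_one(c);
			spin_lock_irq(&lock);
		}
		spin_unlock_irq(&lock);
	}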
rb_erase(&path->rb_node, &priv->path_tree); list_del(&path->list); ipoib_path_free(priv, path); } list_del(&p->list); kfree(p); } } spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_cm_tx_reap(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.reap_task); struct ipoib_cm_tx *p; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); while (!list_empty(&priv->cm.reap_list)) { p = list_entry(priv->cm.reap_list.next, typeof(*p), list); list_del(&p->list); spin_unlock_irqrestore(&priv->lock, flags); ipoib_cm_tx_destroy(p); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_cm_mb_reap(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.mb_task); struct mbuf *mb; unsigned long flags; #if defined(INET) || defined(INET6) unsigned mtu = priv->mcast_mtu; #endif uint16_t proto; spin_lock_irqsave(&priv->lock, flags); for (;;) { IF_DEQUEUE(&priv->cm.mb_queue, mb); if (mb == NULL) break; spin_unlock_irqrestore(&priv->lock, flags); proto = htons(*mtod(mb, uint16_t *)); m_adj(mb, IPOIB_ENCAP_LEN); switch (proto) { #if defined(INET) case ETHERTYPE_IP: icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu); break; #endif default: m_freem(mb); } spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { int e = priv->cm.mb_queue.ifq_len; IF_ENQUEUE(&priv->cm.mb_queue, mb); if (e == 0) queue_work(ipoib_workqueue, &priv->cm.mb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) { ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, cm.rx_reap_task)); } static void ipoib_cm_stale_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.stale_task.work); struct ipoib_cm_rx *p; int ret; spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { /* List is sorted by LRU, start from tail, * stop when we see a recently used entry */ p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) break; list_move(&p->list, &priv->cm.rx_error_list); p->state = IPOIB_CM_RX_ERROR; spin_unlock_irq(&priv->lock); ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); if (ret) ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); spin_lock_irq(&priv->lock); } if (!list_empty(&priv->cm.passive_ids)) queue_delayed_work(ipoib_workqueue, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge) { struct ib_srq_init_attr srq_init_attr = { .attr = { .max_wr = ipoib_recvq_size, .max_sge = max_sge } }; priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); if (IS_ERR(priv->cm.srq)) { if (PTR_ERR(priv->cm.srq) != -ENOSYS) printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", priv->ca->name, PTR_ERR(priv->cm.srq)); priv->cm.srq = NULL; return; } priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL); if (!priv->cm.srq_ring) { printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); ib_destroy_srq(priv->cm.srq); priv->cm.srq = NULL; return; } memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof 
*priv->cm.srq_ring); } int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; int i, ret; struct ib_device_attr attr; INIT_LIST_HEAD(&priv->cm.passive_ids); INIT_LIST_HEAD(&priv->cm.reap_list); INIT_LIST_HEAD(&priv->cm.start_list); INIT_LIST_HEAD(&priv->cm.rx_error_list); INIT_LIST_HEAD(&priv->cm.rx_flush_list); INIT_LIST_HEAD(&priv->cm.rx_drain_list); INIT_LIST_HEAD(&priv->cm.rx_reap_list); INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap); INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue)); mtx_init(&priv->cm.mb_queue.ifq_mtx, dev->if_xname, "if send queue", MTX_DEF); ret = ib_query_device(priv->ca, &attr); if (ret) { printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); return ret; } ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); ipoib_cm_create_srq(priv, attr.max_srq_sge); if (ipoib_cm_has_srq(priv)) { priv->cm.max_cm_mtu = attr.max_srq_sge * MJUMPAGESIZE; priv->cm.num_frags = attr.max_srq_sge; ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", priv->cm.max_cm_mtu, priv->cm.num_frags); } else { priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU; priv->cm.num_frags = IPOIB_CM_RX_SG; } ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge); if (ipoib_cm_has_srq(priv)) { for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(priv); return -ENOMEM; } if (ipoib_cm_post_receive_srq(priv, i)) { ipoib_warn(priv, "ipoib_cm_post_receive_srq " "failed for buf %d\n", i); ipoib_cm_dev_cleanup(priv); return -EIO; } } } IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC; return 0; } void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { int ret; if (!priv->cm.srq) return; ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); ret = ib_destroy_srq(priv->cm.srq); if (ret) ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); priv->cm.srq = NULL; if (!priv->cm.srq_ring) return; ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring); priv->cm.srq_ring = NULL; mtx_destroy(&priv->cm.mb_queue.ifq_mtx); } #endif /* CONFIG_INFINIBAND_IPOIB_CM */ Index: user/ngie/bsnmp_cleanup/sys/ufs/ufs/ufs_extattr.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/ufs/ufs/ufs_extattr.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/ufs/ufs/ufs_extattr.c (revision 298468) @@ -1,1298 +1,1300 @@ /*- * Copyright (c) 1999-2002 Robert N. M. Watson * Copyright (c) 2002-2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Support for filesystem extended attribute: UFS-specific support functions. */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_EXTATTR +FEATURE(ufs_extattr, "ufs extended attribute support"); + static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute"); static int ufs_extattr_sync = 0; SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync, 0, ""); static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname); static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td); static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td); static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td); static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td); #ifdef UFS_EXTATTR_AUTOSTART static int ufs_extattr_autostart_locked(struct mount *mp, struct thread *td); #endif static int ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td); /* * Per-FS attribute lock protecting attribute operations. * * XXXRW: Perhaps something more fine-grained would be appropriate, but at * the end of the day we're going to contend on the vnode lock for the * backing file anyway. */ static void ufs_extattr_uepm_lock(struct ufsmount *ump) { sx_xlock(&ump->um_extattr.uepm_lock); } static void ufs_extattr_uepm_unlock(struct ufsmount *ump) { sx_xunlock(&ump->um_extattr.uepm_lock); } /*- * Determine whether the name passed is a valid name for an actual * attribute. * * Invalid currently consists of: * NULL pointer for attrname * zero-length attrname (used to retrieve application attribute list) */ static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) { if (attrname == NULL) return (0); if (strlen(attrname) == 0) return (0); return (1); } /* * Locate an attribute given a name and mountpoint. 
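The FEATURE(ufs_extattr, ...) line added above is this revision's change to ufs_extattr.c: it publishes a read-only kern.features.ufs_extattr sysctl so userland can detect at run time that the kernel was built with options UFS_EXTATTR. From C the probe is a single libc call, feature_present(3):

	/* Userland probe for the sysctl registered by FEATURE() above. */
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		if (feature_present("ufs_extattr"))
			printf("UFS_EXTATTR is compiled in\n");
		else
			printf("kernel lacks UFS_EXTATTR\n");
		return (0);
	}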
* Must be holding uepm lock for the mount point. */ static struct ufs_extattr_list_entry * ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, const char *attrname) { struct ufs_extattr_list_entry *search_attribute; sx_assert(&ump->um_extattr.uepm_lock, SA_XLOCKED); for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); search_attribute != NULL; search_attribute = LIST_NEXT(search_attribute, uele_entries)) { if (!(strncmp(attrname, search_attribute->uele_attrname, UFS_EXTATTR_MAXEXTATTRNAME)) && (attrnamespace == search_attribute->uele_attrnamespace)) { return (search_attribute); } } return (0); } /* * Initialize per-FS structures supporting extended attributes. Do not * start extended attributes yet. */ void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) { uepm->uepm_flags = 0; LIST_INIT(&uepm->uepm_list); sx_init(&uepm->uepm_lock, "ufs_extattr_sx"); uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; } /* * Destroy per-FS structures supporting extended attributes. Assumes * that EAs have already been stopped, and will panic if not. */ void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) { if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) panic("ufs_extattr_uepm_destroy: not initialized"); if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) panic("ufs_extattr_uepm_destroy: called while still started"); /* * It's not clear that either order for the next two lines is * ideal, and it should never be a problem if this is only called * during unmount, and with vfs_busy(). */ uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; sx_destroy(&uepm->uepm_lock); } /* * Start extended attribute support on an FS. */ int ufs_extattr_start(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error = 0; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); error = ufs_extattr_start_locked(ump, td); ufs_extattr_uepm_unlock(ump); return (error); } static int ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td) { if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return (EOPNOTSUPP); if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) return (EBUSY); ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; ump->um_extattr.uepm_ucred = crhold(td->td_ucred); return (0); } #ifdef UFS_EXTATTR_AUTOSTART /* * Helper routine: given a locked parent directory and filename, return * the locked vnode of the inode associated with the name. Will not * follow symlinks, may return any type of vnode. Lock on parent will * be released even in the event of a failure. In the event that the * target is the parent (i.e., "."), there will be two references and * one lock, requiring the caller to possibly special-case. 
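That contract makes disposal at the call sites the tricky part: a caller has to tell the target == parent case (second reference, single lock, so vrele() only) apart from the ordinary case (locked and referenced, so vput() unless the vnode is handed off unlocked). Condensed from the directory-iteration code later in this file:

	error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, name, &vp, td);
	if (error) {
		/* lookup already cleaned up its locks */
	} else if (vp == dvp) {
		vrele(vp);	/* "." case: extra ref, one lock */
	} else if (vp->v_type != VREG) {
		vput(vp);	/* locked + referenced: drop both */
	} else {
		/* enable_with_open() returns vp unlocked, hence vrele() */
		error = ufs_extattr_enable_with_open(ump, vp, nspace, name, td);
		vrele(vp);
	}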
*/ #define UE_GETDIR_LOCKPARENT 1 #define UE_GETDIR_LOCKPARENT_DONT 2 static int ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname, struct vnode **vp, struct thread *td) { struct vop_cachedlookup_args vargs; struct componentname cnp; struct vnode *target_vp; int error; bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; if (lockparent == UE_GETDIR_LOCKPARENT) cnp.cn_flags |= LOCKPARENT; cnp.cn_lkflags = LK_EXCLUSIVE; cnp.cn_thread = td; cnp.cn_cred = td->td_ucred; cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); cnp.cn_nameptr = cnp.cn_pnbuf; error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN, (size_t *) &cnp.cn_namelen); if (error) { if (lockparent == UE_GETDIR_LOCKPARENT_DONT) { VOP_UNLOCK(start_dvp, 0); } uma_zfree(namei_zone, cnp.cn_pnbuf); printf("ufs_extattr_lookup: copystr failed\n"); return (error); } cnp.cn_namelen--; /* trim nul termination */ vargs.a_gen.a_desc = NULL; vargs.a_dvp = start_dvp; vargs.a_vpp = &target_vp; vargs.a_cnp = &cnp; error = ufs_lookup(&vargs); uma_zfree(namei_zone, cnp.cn_pnbuf); if (error) { /* * Error condition, may have to release the lock on the parent * if ufs_lookup() didn't. */ if (lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0); /* * Check that ufs_lookup() didn't release the lock when we * didn't want it to. */ if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); return (error); } /* if (target_vp == start_dvp) panic("ufs_extattr_lookup: target_vp == start_dvp"); */ if (target_vp != start_dvp && lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0); if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); /* printf("ufs_extattr_lookup: success\n"); */ *vp = target_vp; return (0); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Enable an EA using the passed filesystem, backing vnode, attribute name, * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp * to be locked when passed in. The vnode will be returned unlocked, * regardless of success/failure of the function. As a result, the caller * will always need to vrele(), but not vput(). */ static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { int error; error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL); if (error) { printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " "with %d\n", error); VOP_UNLOCK(vp, 0); return (error); } VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); vref(vp); VOP_UNLOCK(vp, 0); error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td); if (error != 0) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (error); } #ifdef UFS_EXTATTR_AUTOSTART /* * Given a locked directory vnode, iterate over the names in the directory * and use ufs_extattr_lookup() to retrieve locked vnodes of potential * attribute files. Then invoke ufs_extattr_enable_with_open() on each * to attempt to start the attribute. Leaves the directory locked on * exit. 
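The scan below reads raw directory blocks via ufs_readdir() and then steps through variable-length struct dirent records by hand, chaining on d_reclen and treating a zero d_reclen as end-of-block. The traversal idiom in isolation (resid is whatever the uio left unconsumed):

	#include <sys/types.h>
	#include <sys/dirent.h>

	static void
	walk_dirents(char *dirbuf, size_t blksize, size_t resid,
	    void (*visit)(struct dirent *))
	{
		struct dirent *dp, *edp;

		/* only blksize - resid bytes were actually filled in */
		edp = (struct dirent *)&dirbuf[blksize - resid];
		for (dp = (struct dirent *)dirbuf; dp < edp;
		    dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
			if (dp->d_reclen == 0)
				break;	/* short record ends the block */
			visit(dp);
		}
	}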
*/ static int ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, int attrnamespace, struct thread *td) { struct vop_readdir_args vargs; struct dirent *dp, *edp; struct vnode *attr_vp; struct uio auio; struct iovec aiov; char *dirbuf; int error, eofflag = 0; if (dvp->v_type != VDIR) return (ENOTDIR); dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; vargs.a_gen.a_desc = NULL; vargs.a_vp = dvp; vargs.a_uio = &auio; vargs.a_cred = td->td_ucred; vargs.a_eofflag = &eofflag; vargs.a_ncookies = NULL; vargs.a_cookies = NULL; while (!eofflag) { auio.uio_resid = DIRBLKSIZ; aiov.iov_base = dirbuf; aiov.iov_len = DIRBLKSIZ; error = ufs_readdir(&vargs); if (error) { printf("ufs_extattr_iterate_directory: ufs_readdir " "%d\n", error); return (error); } edp = (struct dirent *)&dirbuf[DIRBLKSIZ - auio.uio_resid]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { if (dp->d_reclen == 0) break; error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, dp->d_name, &attr_vp, td); if (error) { printf("ufs_extattr_iterate_directory: lookup " "%s %d\n", dp->d_name, error); } else if (attr_vp == dvp) { vrele(attr_vp); } else if (attr_vp->v_type != VREG) { vput(attr_vp); } else { error = ufs_extattr_enable_with_open(ump, attr_vp, attrnamespace, dp->d_name, td); vrele(attr_vp); if (error) { printf("ufs_extattr_iterate_directory: " "enable %s %d\n", dp->d_name, error); } else if (bootverbose) { printf("UFS autostarted EA %s\n", dp->d_name); } } dp = (struct dirent *) ((char *)dp + dp->d_reclen); if (dp >= edp) break; } } free(dirbuf, M_TEMP); return (0); } /* * Auto-start of extended attributes, to be executed (optionally) at * mount-time. */ int ufs_extattr_autostart(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); error = ufs_extattr_autostart_locked(mp, td); ufs_extattr_uepm_unlock(ump); return (error); } static int ufs_extattr_autostart_locked(struct mount *mp, struct thread *td) { struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * UFS_EXTATTR applies only to UFS1, as UFS2 uses native extended * attributes, so don't autostart. */ if (ump->um_fstype != UFS1) return (0); /* * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? * If so, automatically start EA's. */ error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp); if (error) { printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", error); return (error); } error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT, UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td); if (error) { /* rvp ref'd but now unlocked */ vrele(rvp); return (error); } if (rvp == attr_dvp) { /* Should never happen. */ vput(rvp); vrele(attr_dvp); return (EINVAL); } vrele(rvp); if (attr_dvp->v_type != VDIR) { printf("ufs_extattr_autostart: %s != VDIR\n", UFS_EXTATTR_FSROOTSUBDIR); goto return_vput_attr_dvp; } error = ufs_extattr_start_locked(ump, td); if (error) { printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", error); goto return_vput_attr_dvp; } /* * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, * and start with appropriate type. Failures in either don't * result in an over-all failure. attr_dvp is left locked to * be cleaned up on exit. 
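 *
 * With the default macro values this amounts to an on-disk layout of
 * roughly (editor's illustration; the actual names come from the
 * UFS_EXTATTR_* macros in ufs/ufs/extattr.h):
 *
 *	/.attribute/system/<attrname>	-> EXTATTR_NAMESPACE_SYSTEM
 *	/.attribute/user/<attrname>	-> EXTATTR_NAMESPACE_USER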
*/ error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_system_dvp); } error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_user_dvp, EXTATTR_NAMESPACE_USER, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_user_dvp); } /* Mask startup failures in sub-directories. */ error = 0; return_vput_attr_dvp: vput(attr_dvp); return (error); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Stop extended attribute support on an FS. */ int ufs_extattr_stop(struct mount *mp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct ufsmount *ump = VFSTOUFS(mp); int error = 0; ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto unlock; } while ((uele = LIST_FIRST(&ump->um_extattr.uepm_list)) != NULL) { ufs_extattr_disable(ump, uele->uele_attrnamespace, uele->uele_attrname, td); } ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; crfree(ump->um_extattr.uepm_ucred); ump->um_extattr.uepm_ucred = NULL; unlock: ufs_extattr_uepm_unlock(ump); return (error); } /* * Enable a named attribute on the specified filesystem; provide an * unlocked backing vnode to hold the attribute data. */ static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct iovec aiov; struct uio auio; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); if (backing_vnode->v_type != VREG) return (EINVAL); attribute = malloc(sizeof(struct ufs_extattr_list_entry), M_UFS_EXTATTR, M_WAITOK); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto free_exit; } if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { error = EEXIST; goto free_exit; } strncpy(attribute->uele_attrname, attrname, UFS_EXTATTR_MAXEXTATTRNAME); attribute->uele_attrnamespace = attrnamespace; bzero(&attribute->uele_fileheader, sizeof(struct ufs_extattr_fileheader)); attribute->uele_backing_vnode = backing_vnode; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (caddr_t) &attribute->uele_fileheader; aiov.iov_len = sizeof(struct ufs_extattr_fileheader); auio.uio_resid = sizeof(struct ufs_extattr_fileheader); auio.uio_offset = (off_t) 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto unlock_free_exit; if (auio.uio_resid != 0) { printf("ufs_extattr_enable: malformed attribute header\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { printf("ufs_extattr_enable: invalid attribute header magic\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { printf("ufs_extattr_enable: incorrect attribute header " "version\n"); error = EINVAL; goto unlock_free_exit; } ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable"); LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries); 
VOP_UNLOCK(backing_vnode, 0); return (0); unlock_free_exit: VOP_UNLOCK(backing_vnode, 0); free_exit: free(attribute, M_UFS_EXTATTR); return (error); } /* * Disable extended attribute support on an FS. */ static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td) { struct ufs_extattr_list_entry *uele; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); if (!uele) return (ENOATTR); LIST_REMOVE(uele, uele_entries); vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable"); VOP_UNLOCK(uele->uele_backing_vnode, 0); error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, td->td_ucred, td); free(uele, M_UFS_EXTATTR); return (error); } /* * VFS call to manage extended attributes in UFS. If filename_vp is * non-NULL, it must be passed in locked, and regardless of errors in * processing, will be unlocked. */ int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { struct ufsmount *ump = VFSTOUFS(mp); struct thread *td = curthread; int error; /* * Processes with privilege, but in jail, are not allowed to * configure extended attributes. */ error = priv_check(td, PRIV_UFS_EXTATTRCTL); if (error) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (error); } /* * We only allow extattrctl(2) on UFS1 file systems, as UFS2 uses * native extended attributes. */ if (ump->um_fstype != UFS1) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (EOPNOTSUPP); } switch(cmd) { case UFS_EXTATTR_CMD_START: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_start(mp, td); return (error); case UFS_EXTATTR_CMD_STOP: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_stop(mp, td); return (error); case UFS_EXTATTR_CMD_ENABLE: if (filename_vp == NULL) return (EINVAL); if (attrname == NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } /* * ufs_extattr_enable_with_open() will always unlock the * vnode, regardless of failure. */ ufs_extattr_uepm_lock(ump); error = ufs_extattr_enable_with_open(ump, filename_vp, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump); return (error); case UFS_EXTATTR_CMD_DISABLE: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump); error = ufs_extattr_disable(ump, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump); return (error); default: return (EINVAL); } } /* * Vnode operating to retrieve a named extended attribute. */ int ufs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump); error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Real work associated with retrieving a named attribute--assumes that * the attribute lock has already been grabbed. 
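 *
 * The backing file is laid out as one fixed-size slot per inode, which
 * is what the offset computation below relies on (editor's summary):
 *
 *	base_offset = sizeof(struct ufs_extattr_fileheader) +
 *	    ip->i_number * (sizeof(struct ufs_extattr_header) +
 *	    attribute->uele_fileheader.uef_size);
 *
 * For illustration only, with hypothetical sizes of 16 and 8 bytes and
 * uef_size = 64, inode 10's data header would start at byte
 * 16 + 10 * (8 + 64) = 736.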
*/ static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; size_t len, old_len; int error = 0; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (strlen(name) == 0) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VREAD); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Allow only offsets of zero to encourage the read/replace * extended attribute semantic. Otherwise we can't guarantee * atomicity, as we don't provide locks for extended attributes. */ if (uio != NULL && uio->uio_offset != 0) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Read in the data header to see if the data is defined, and if so * how much. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. * * Don't need to get a lock on the backing file if the getattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number * than the attribute data. For now, the best solution * is to coerce this to undefined, and let it get cleaned * up by the next write or extattrctl clean. */ printf("ufs_extattr_get (%s): inode number inconsistency (%d, %ju)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (uintmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Local size consistency check. */ if (ueh.ueh_len > attribute->uele_fileheader.uef_size) { error = ENXIO; goto vopunlock_exit; } /* Return full data size if caller requested it. */ if (size != NULL) *size = ueh.ueh_len; /* Return data if the caller requested it. */ if (uio != NULL) { /* Allow for offset into the attribute data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); /* * Figure out maximum to transfer -- use buffer size and * local data limit. 
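 *
 * Editor's worked example: for a 100-byte attribute (ueh.ueh_len ==
 * 100) and a 40-byte caller buffer (uio->uio_resid == 40), len =
 * MIN(40, 100) = 40 bytes are read; afterwards uio_resid is rewritten
 * as old_len - (len - uio->uio_resid), so a short read is reflected
 * back to the caller.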
*/ len = MIN(uio->uio_resid, ueh.ueh_len); old_len = uio->uio_resid; uio->uio_resid = len; error = VOP_READ(attribute->uele_backing_vnode, uio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; uio->uio_resid = old_len - (len - uio->uio_resid); } vopunlock_exit: if (uio != NULL) uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Vnode operation to remove a named attribute. */ int ufs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump); error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Vnode operation to set a named attribute. */ int ufs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * XXX: No longer a supported way to delete extended attributes. */ if (ap->a_uio == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump); error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Real work associated with setting a vnode's extended attributes; * assumes that the attribute lock has already been grabbed. */ static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Early rejection of invalid offsets/length. * Reject: any offset but 0 (replace) * Any size greater than attribute size limit */ if (uio->uio_offset != 0 || uio->uio_resid > attribute->uele_fileheader.uef_size) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Write out a data header for the data. */ ueh.ueh_len = uio->uio_resid; ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE; ueh.ueh_i_gen = ip->i_gen; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. 
* * Don't need to get a lock on the backing file if the setattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) { error = ENXIO; goto vopunlock_exit; } /* * Write out user data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, ump->um_extattr.uepm_ucred); vopunlock_exit: uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Real work associated with removing an extended attribute from a vnode. * Assumes the attribute lock has already been grabbed. */ static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Check to see if currently defined. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Don't need to get the lock on the backing vnode if the vnode we're * modifying is it, as we already hold the lock. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number than * the attribute data. For now, the best solution is to * coerce this to undefined, and let it get cleaned up by * the next write or extattrctl clean. */ printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Flag it as not in use. 
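 * (Note that removal is a soft delete: only this header is rewritten
 * with the in-use flag and length cleared; the stale attribute bytes
 * stay in the backing file until the slot is reused by a later
 * ufs_extattr_set().)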
*/ ueh.ueh_flags = 0; ueh.ueh_len = 0; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) error = ENXIO; vopunlock_exit: VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Called by UFS when an inode is no longer active and should have its * attributes stripped. */ void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); /* * In that case, we cannot lock. We should not have any active vnodes * on the fs if this is not yet initialized but is going to be, so * this can go unlocked. */ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return; ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { ufs_extattr_uepm_unlock(ump); return; } LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) ufs_extattr_rm(vp, uele->uele_attrnamespace, uele->uele_attrname, NULL, td); ufs_extattr_uepm_unlock(ump); } #endif /* !UFS_EXTATTR */ Index: user/ngie/bsnmp_cleanup/usr.bin/xlint/lint1/decl.c =================================================================== --- user/ngie/bsnmp_cleanup/usr.bin/xlint/lint1/decl.c (revision 298467) +++ user/ngie/bsnmp_cleanup/usr.bin/xlint/lint1/decl.c (revision 298468) @@ -1,3052 +1,3052 @@ /* $NetBSD: decl.c,v 1.33 2004/06/20 22:20:16 jmc Exp $ */ /* * Copyright (c) 1996 Christopher G. Demetriou. All Rights Reserved. * Copyright (c) 1994, 1995 Jochen Pohl * All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Jochen Pohl for * The NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include

#if defined(__RCSID) && !defined(lint)
__RCSID("$NetBSD: decl.c,v 1.33 2004/06/20 22:20:16 jmc Exp $");
#endif
__FBSDID("$FreeBSD$");

#include
#include
#include

#include "lint1.h"

const	char *unnamed = "";

/* shared type structures for arithmetic types and void */
static	type_t	*typetab;

/* value of next enumerator during declaration of enum types */
int	enumval;

/*
 * pointer to top element of a stack which contains information local
 * to nested declarations
 */
dinfo_t	*dcs;

static	type_t	*tdeferr(type_t *, tspec_t);
static	void	settdsym(type_t *, sym_t *);
static	tspec_t	mrgtspec(tspec_t, tspec_t);
static	void	align(int, int);
static	sym_t	*newtag(sym_t *, scl_t, int, int);
static	int	eqargs(type_t *, type_t *, int *);
static	int	mnoarg(type_t *, int *);
static	int	chkosdef(sym_t *, sym_t *);
static	int	chkptdecl(sym_t *, sym_t *);
static	sym_t	*nsfunc(sym_t *, sym_t *);
static	void	osfunc(sym_t *, sym_t *);
static	void	ledecl(sym_t *);
static	int	chkinit(sym_t *);
static	void	chkausg(int, sym_t *);
static	void	chkvusg(int, sym_t *);
static	void	chklusg(sym_t *);
static	void	chktusg(sym_t *);
static	void	chkglvar(sym_t *);
static	void	glchksz(sym_t *);

/*
 * initializes all global vars used in declarations
 */
void
initdecl(void)
{
	int i;

	/* declaration stack */
	if ((dcs = calloc(1, sizeof (dinfo_t))) == NULL)
		nomem();
	dcs->d_ctx = EXTERN;
	dcs->d_ldlsym = &dcs->d_dlsyms;

	/* type information and classification */
	inittyp();

	/* shared type structures */
	if ((typetab = calloc(NTSPEC, sizeof (type_t))) == NULL)
		nomem();
	for (i = 0; i < NTSPEC; i++)
		typetab[i].t_tspec = NOTSPEC;
	typetab[CHAR].t_tspec = CHAR;
	typetab[SCHAR].t_tspec = SCHAR;
	typetab[UCHAR].t_tspec = UCHAR;
	typetab[SHORT].t_tspec = SHORT;
	typetab[USHORT].t_tspec = USHORT;
	typetab[INT].t_tspec = INT;
	typetab[UINT].t_tspec = UINT;
	typetab[LONG].t_tspec = LONG;
	typetab[ULONG].t_tspec = ULONG;
	typetab[QUAD].t_tspec = QUAD;
	typetab[UQUAD].t_tspec = UQUAD;
	typetab[FLOAT].t_tspec = FLOAT;
	typetab[DOUBLE].t_tspec = DOUBLE;
	typetab[LDOUBLE].t_tspec = LDOUBLE;
	typetab[VOID].t_tspec = VOID;
	/*
	 * The next two are not real types. They are only used by the
	 * parser to return the keywords "signed" and "unsigned".
	 */
	typetab[SIGNED].t_tspec = SIGNED;
	typetab[UNSIGN].t_tspec = UNSIGN;
}

/*
 * Returns a shared type structure for arithmetic types and void.
 *
 * It's important to duplicate this structure (using duptyp() or tduptyp())
 * if it is to be modified (adding qualifiers or anything else).
 */
type_t *
gettyp(tspec_t t)
{

	return (&typetab[t]);
}

type_t *
duptyp(const type_t *tp)
{
	type_t	*ntp;

	ntp = getblk(sizeof (type_t));
	STRUCT_ASSIGN(*ntp, *tp);
	return (ntp);
}

/*
 * Use tduptyp() instead of duptyp() inside expressions (if the
 * allocated memory should be freed after the expr).
 */
type_t *
tduptyp(const type_t *tp)
{
	type_t	*ntp;

	ntp = tgetblk(sizeof (type_t));
	STRUCT_ASSIGN(*ntp, *tp);
	return (ntp);
}

/*
 * Returns 1 if the argument is void or an incomplete array,
 * struct, union or enum type.
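 *
 * Editor's examples: after "struct s;" the struct type is incomplete
 * until a matching definition with a member list is seen, and
 * "extern int a[];" yields an incomplete array type; both make
 * incompl() return 1.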
*/ int incompl(type_t *tp) { tspec_t t; if ((t = tp->t_tspec) == VOID) { return (1); } else if (t == ARRAY) { return (tp->t_aincompl); } else if (t == STRUCT || t == UNION) { return (tp->t_str->sincompl); } else if (t == ENUM) { return (tp->t_enum->eincompl); } return (0); } /* * Set the flag for (in)complete array, struct, union or enum * types. */ void setcompl(type_t *tp, int ic) { tspec_t t; if ((t = tp->t_tspec) == ARRAY) { tp->t_aincompl = ic; } else if (t == STRUCT || t == UNION) { tp->t_str->sincompl = ic; } else { if (t != ENUM) LERROR("setcompl()"); tp->t_enum->eincompl = ic; } } /* * Remember the storage class of the current declaration in dcs->d_scl * (the top element of the declaration stack) and detect multiple * storage classes. */ void addscl(scl_t sc) { if (sc == INLINE) { if (dcs->d_inline) /* duplicate '%s' */ warning(10, "inline"); dcs->d_inline = 1; return; } if (dcs->d_type != NULL || dcs->d_atyp != NOTSPEC || dcs->d_smod != NOTSPEC || dcs->d_lmod != NOTSPEC) { /* storage class after type is obsolescent */ warning(83); } if (dcs->d_scl == NOSCL) { dcs->d_scl = sc; } else { /* * multiple storage classes. An error will be reported in * deftyp(). */ dcs->d_mscl = 1; } } /* * Remember the type, modifier or typedef name returned by the parser * in *dcs (top element of decl stack). This information is used in * deftyp() to build the type used for all declarators in this * declaration. * * Is tp->t_typedef 1, the type comes from a previously defined typename. * Otherwise it comes from a type specifier (int, long, ...) or a * struct/union/enum tag. */ void addtype(type_t *tp) { tspec_t t; if (tp->t_typedef) { if (dcs->d_type != NULL || dcs->d_atyp != NOTSPEC || dcs->d_lmod != NOTSPEC || dcs->d_smod != NOTSPEC) { /* * something like "typedef int a; int a b;" * This should not happen with current grammar. */ LERROR("addtype()"); } dcs->d_type = tp; return; } t = tp->t_tspec; if (t == STRUCT || t == UNION || t == ENUM) { /* * something like "int struct a ..." * struct/union/enum with anything else is not allowed */ if (dcs->d_type != NULL || dcs->d_atyp != NOTSPEC || dcs->d_lmod != NOTSPEC || dcs->d_smod != NOTSPEC) { /* * remember that an error must be reported in * deftyp(). */ dcs->d_terr = 1; dcs->d_atyp = dcs->d_lmod = dcs->d_smod = NOTSPEC; } dcs->d_type = tp; return; } if (dcs->d_type != NULL && !dcs->d_type->t_typedef) { /* * something like "struct a int" * struct/union/enum with anything else is not allowed */ dcs->d_terr = 1; return; } if (t == LONG && dcs->d_lmod == LONG) { /* "long long" or "long ... long" */ t = QUAD; dcs->d_lmod = NOTSPEC; if (!quadflg) /* %s C does not support 'long long' */ (void)c99ism(265, tflag ? "traditional" : "c89"); } if (dcs->d_type != NULL && dcs->d_type->t_typedef) { /* something like "typedef int a; a long ..." 
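 * For instance (editor's illustration):
 *	typedef int t;
 *	t long x;
 * ends up here with t == LONG; tdeferr() then rewrites the typedef'ed
 * int to long and issues warning 5 ("modifying typedef with ...").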
*/ dcs->d_type = tdeferr(dcs->d_type, t); return; } /* now it can be only a combination of arithmetic types and void */ if (t == SIGNED || t == UNSIGN) { /* remember specifiers "signed" and "unsigned" in dcs->d_smod */ if (dcs->d_smod != NOTSPEC) /* * more than one "signed" and/or "unsigned"; print * an error in deftyp() */ dcs->d_terr = 1; dcs->d_smod = t; } else if (t == SHORT || t == LONG || t == QUAD) { /* * remember specifiers "short", "long" and "long long" in * dcs->d_lmod */ if (dcs->d_lmod != NOTSPEC) /* more than one, print error in deftyp() */ dcs->d_terr = 1; dcs->d_lmod = t; } else { /* * remember specifiers "void", "char", "int", "float" or * "double" int dcs->d_atyp */ if (dcs->d_atyp != NOTSPEC) /* more than one, print error in deftyp() */ dcs->d_terr = 1; dcs->d_atyp = t; } } /* * called if a list of declaration specifiers contains a typedef name * and other specifiers (except struct, union, enum, typedef name) */ static type_t * tdeferr(type_t *td, tspec_t t) { tspec_t t2; t2 = td->t_tspec; switch (t) { case SIGNED: case UNSIGN: if (t2 == CHAR || t2 == SHORT || t2 == INT || t2 == LONG || t2 == QUAD) { if (!tflag) /* modifying typedef with ... */ warning(5, ttab[t].tt_name); td = duptyp(gettyp(mrgtspec(t2, t))); td->t_typedef = 1; return (td); } break; case SHORT: if (t2 == INT || t2 == UINT) { /* modifying typedef with ... */ warning(5, "short"); td = duptyp(gettyp(t2 == INT ? SHORT : USHORT)); td->t_typedef = 1; return (td); } break; case LONG: if (t2 == INT || t2 == UINT || t2 == LONG || t2 == ULONG || t2 == FLOAT || t2 == DOUBLE) { /* modifying typedef with ... */ warning(5, "long"); if (t2 == INT) { td = gettyp(LONG); } else if (t2 == UINT) { td = gettyp(ULONG); } else if (t2 == LONG) { td = gettyp(QUAD); } else if (t2 == ULONG) { td = gettyp(UQUAD); } else if (t2 == FLOAT) { td = gettyp(DOUBLE); } else if (t2 == DOUBLE) { td = gettyp(LDOUBLE); } td = duptyp(td); td->t_typedef = 1; return (td); } break; /* LINTED (enumeration values not handled in switch) */ case NOTSPEC: case USHORT: case UCHAR: case SCHAR: case CHAR: case FUNC: case ARRAY: case PTR: case ENUM: case UNION: case STRUCT: case VOID: case LDOUBLE: case DOUBLE: case FLOAT: case UQUAD: case QUAD: case ULONG: case UINT: case INT: break; } /* Anything other is not accepted. */ dcs->d_terr = 1; return (td); } /* * Remember the symbol of a typedef name (2nd arg) in a struct, union * or enum tag if the typedef name is the first defined for this tag. * * If the tag is unnamed, the typdef name is used for identification * of this tag in lint2. Although its possible that more than one typedef * name is defined for one tag, the first name defined should be unique * if the tag is unnamed. */ static void settdsym(type_t *tp, sym_t *sym) { tspec_t t; if ((t = tp->t_tspec) == STRUCT || t == UNION) { if (tp->t_str->stdef == NULL) tp->t_str->stdef = sym; } else if (t == ENUM) { if (tp->t_enum->etdef == NULL) tp->t_enum->etdef = sym; } } /* * Remember a qualifier which is part of the declaration specifiers * (and not the declarator) in the top element of the declaration stack. * Also detect multiple qualifiers of the same kind. * The remembered qualifier is used by deftyp() to construct the type * for all declarators. 
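 *
 * Editor's examples: "const const int c;" draws warning 10
 * (duplicate "%s"), while in "const int i, *p;" the remembered
 * qualifier ends up in dcs->d_type and thus applies to i as well as
 * to the type p points to.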
 */
void
addqual(tqual_t q)
{

	if (q == CONST) {
		if (dcs->d_const) {
			/* duplicate "%s" */
			warning(10, "const");
		}
		dcs->d_const = 1;
	} else {
		if (q != VOLATILE)
			LERROR("addqual()");
		if (dcs->d_volatile) {
			/* duplicate "%s" */
			warning(10, "volatile");
		}
		dcs->d_volatile = 1;
	}
}

/*
 * Go to the next declaration level (structs, nested structs, blocks,
 * argument declaration lists ...)
 */
void
pushdecl(scl_t sc)
{
	dinfo_t	*di;

	if (dflag)
		(void)printf("pushdecl(%d)\n", (int)sc);

	/* put a new element on the declaration stack */
	if ((di = calloc(1, sizeof (dinfo_t))) == NULL)
		nomem();
	di->d_nxt = dcs;
	dcs = di;
	di->d_ctx = sc;
	di->d_ldlsym = &di->d_dlsyms;
}

/*
 * Go back to previous declaration level
 */
void
popdecl(void)
{
	dinfo_t	*di;

	if (dflag)
		(void)printf("popdecl(%d)\n", (int)dcs->d_ctx);

	if (dcs->d_nxt == NULL)
		LERROR("popdecl()");
	di = dcs;
	dcs = di->d_nxt;
	switch (di->d_ctx) {
	case EXTERN:
		/* there is nothing after external declarations */
		LERROR("popdecl()");
		/* NOTREACHED */
	case MOS:
	case MOU:
	case ENUMCON:
		/*
		 * Symbols declared in (nested) structs or enums are
		 * part of the next level (they are removed from the
		 * symbol table if the symbols of the outer level are
		 * removed)
		 */
		if ((*dcs->d_ldlsym = di->d_dlsyms) != NULL)
			dcs->d_ldlsym = di->d_ldlsym;
		break;
	case ARG:
		/*
		 * All symbols in dcs->d_dlsyms are introduced in old style
		 * argument declarations (it's not clean, but possible).
		 * They are appended to the list of symbols declared in
		 * an old style argument identifier list or a new style
		 * parameter type list.
		 */
		if (di->d_dlsyms != NULL) {
			*di->d_ldlsym = dcs->d_fpsyms;
			dcs->d_fpsyms = di->d_dlsyms;
		}
		break;
	case ABSTRACT:
		/*
		 * casts and sizeof
		 * Append all symbols declared in the abstract declaration
		 * to the list of symbols declared in the surrounding decl.
		 * or block.
		 * XXX I'm not sure whether they should be removed from the
		 * symbol table now or later.
		 */
		if ((*dcs->d_ldlsym = di->d_dlsyms) != NULL)
			dcs->d_ldlsym = di->d_ldlsym;
		break;
	case AUTO:
		/* check usage of local vars */
		chkusage(di);
		/* FALLTHROUGH */
	case PARG:
		/* usage of arguments will be checked by funcend() */
		rmsyms(di->d_dlsyms);
		break;
	default:
		LERROR("popdecl()");
	}
	free(di);
}

/*
 * Set flag d_asm in all declaration stack elements up to the
 * outermost one.
 *
 * This is used to mark compound statements which have, possibly in
 * nested compound statements, asm statements. For these compound
 * statements no warnings about unused or uninitialized variables are
 * printed.
 *
 * There is no need to clear d_asm in dinfo structs with context AUTO,
 * because these structs are freed at the end of the compound statement.
 * But it must be cleared in the outermost dinfo struct, which has
 * context EXTERN. This could be done in clrtyp() and would work for
 * C, but not for C++ (due to mixed statements and declarations). Thus
 * we clear it in glclup(), which is used to do some cleanup after
 * global declarations/definitions.
 */
void
setasm(void)
{
	dinfo_t	*di;

	for (di = dcs; di != NULL; di = di->d_nxt)
		di->d_asm = 1;
}

/*
 * Clean all elements of the top element of declaration stack which
 * will be used by the next declaration
 */
void
clrtyp(void)
{

	dcs->d_atyp = dcs->d_smod = dcs->d_lmod = NOTSPEC;
	dcs->d_scl = NOSCL;
	dcs->d_type = NULL;
	dcs->d_const = dcs->d_volatile = 0;
	dcs->d_inline = 0;
	dcs->d_mscl = dcs->d_terr = 0;
	dcs->d_nedecl = 0;
	dcs->d_notyp = 0;
}

/*
 * Create a type structure from the information gathered in
 * the declaration stack.
 * Complain about storage classes which are not possible in current
 * context.
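 *
 * Editor's examples of the merging done here:
 *
 *	(no specifier)	-> int			(d_notyp is set)
 *	unsigned	-> unsigned int
 *	long float	-> double		(warning 6, unless -t)
 *	long double	-> long double		(warning 266 under -t)
 *	short char	-> illegal type combination (error 4)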
*/ void deftyp(void) { tspec_t t, s, l; type_t *tp; scl_t scl; t = dcs->d_atyp; /* CHAR, INT, FLOAT, DOUBLE, VOID */ s = dcs->d_smod; /* SIGNED, UNSIGNED */ l = dcs->d_lmod; /* SHORT, LONG, QUAD */ tp = dcs->d_type; scl = dcs->d_scl; if (t == NOTSPEC && s == NOTSPEC && l == NOTSPEC && tp == NULL) dcs->d_notyp = 1; if (tp != NULL && (t != NOTSPEC || s != NOTSPEC || l != NOTSPEC)) { /* should never happen */ LERROR("deftyp()"); } if (tp == NULL) { switch (t) { case NOTSPEC: t = INT; /* FALLTHROUGH */ case INT: if (s == NOTSPEC) s = SIGNED; break; case CHAR: if (l != NOTSPEC) { dcs->d_terr = 1; l = NOTSPEC; } break; case FLOAT: if (l == LONG) { l = NOTSPEC; t = DOUBLE; if (!tflag) /* use 'double' instead of ... */ warning(6); } break; case DOUBLE: if (l == LONG) { l = NOTSPEC; t = LDOUBLE; if (tflag) /* 'long double' is illegal in ... */ warning(266); } break; case VOID: break; default: LERROR("deftyp()"); } if (t != INT && t != CHAR && (s != NOTSPEC || l != NOTSPEC)) { dcs->d_terr = 1; l = s = NOTSPEC; } if (l != NOTSPEC) t = l; dcs->d_type = gettyp(mrgtspec(t, s)); } if (dcs->d_mscl) { /* only one storage class allowed */ error(7); } if (dcs->d_terr) { /* illegal type combination */ error(4); } if (dcs->d_ctx == EXTERN) { if (scl == REG || scl == AUTO) { /* illegal storage class */ error(8); scl = NOSCL; } } else if (dcs->d_ctx == ARG || dcs->d_ctx == PARG) { if (scl != NOSCL && scl != REG) { /* only "register" valid ... */ error(9); scl = NOSCL; } } dcs->d_scl = scl; if (dcs->d_const && dcs->d_type->t_const) { if (!dcs->d_type->t_typedef) LERROR("deftyp()"); /* typedef already qualified with "%s" */ warning(68, "const"); } if (dcs->d_volatile && dcs->d_type->t_volatile) { if (!dcs->d_type->t_typedef) LERROR("deftyp()"); /* typedef already qualified with "%s" */ warning(68, "volatile"); } if (dcs->d_const || dcs->d_volatile) { dcs->d_type = duptyp(dcs->d_type); dcs->d_type->t_const |= dcs->d_const; dcs->d_type->t_volatile |= dcs->d_volatile; } } /* * Merge type specifiers (char, ..., long long, signed, unsigned). */ static tspec_t mrgtspec(tspec_t t, tspec_t s) { if (s == SIGNED || s == UNSIGN) { if (t == CHAR) { t = s == SIGNED ? SCHAR : UCHAR; } else if (t == SHORT) { t = s == SIGNED ? SHORT : USHORT; } else if (t == INT) { t = s == SIGNED ? INT : UINT; } else if (t == LONG) { t = s == SIGNED ? LONG : ULONG; } else if (t == QUAD) { t = s == SIGNED ? QUAD : UQUAD; } } return (t); } /* * Return the length of a type in bit. * * Printing a message if the outhermost dimension of an array is 0 must * be done by the caller. All other problems are reported by length() * if name is not NULL. */ int length(type_t *tp, const char *name) { int elem, elsz; elem = 1; while (tp && tp->t_tspec == ARRAY) { elem *= tp->t_dim; tp = tp->t_subt; } if (tp == NULL) return -1; switch (tp->t_tspec) { case FUNC: /* compiler takes size of function */ LERROR("%s", msgs[12]); /* NOTREACHED */ case STRUCT: case UNION: if (incompl(tp) && name != NULL) { /* incomplete structure or union %s: %s */ error(31, tp->t_str->stag->s_name, name); } elsz = tp->t_str->size; break; case ENUM: if (incompl(tp) && name != NULL) { /* incomplete enum type: %s */ warning(13, name); } /* FALLTHROUGH */ default: elsz = size(tp->t_tspec); if (elsz <= 0) LERROR("length()"); break; } return (elem * elsz); } /* * Get the alignment of the given type in bits. 
*/ int getbound(type_t *tp) { int a; tspec_t t; while (tp && tp->t_tspec == ARRAY) tp = tp->t_subt; if (tp == NULL) return -1; if ((t = tp->t_tspec) == STRUCT || t == UNION) { a = tp->t_str->align; } else if (t == FUNC) { /* compiler takes alignment of function */ error(14); a = LINT_ALIGN(1) * CHAR_BIT; } else { if ((a = size(t)) == 0) { a = CHAR_BIT; } else if (a > LINT_ALIGN(1) * CHAR_BIT) { a = LINT_ALIGN(1) * CHAR_BIT; } } if (a < CHAR_BIT || a > LINT_ALIGN(1) * CHAR_BIT) LERROR("getbound()"); return (a); } /* * Concatenate two lists of symbols by s_nxt. Used by declarations of * struct/union/enum elements and parameters. */ sym_t * lnklst(sym_t *l1, sym_t *l2) { sym_t *l; if ((l = l1) == NULL) return (l2); while (l1->s_nxt != NULL) l1 = l1->s_nxt; l1->s_nxt = l2; return (l); } /* * Check if the type of the given symbol is valid and print an error * message if it is not. * * Invalid types are: * - arrays of incomlete types or functions * - functions returning arrays or functions * - void types other than type of function or pointer */ void chktyp(sym_t *sym) { tspec_t to, t; type_t **tpp, *tp; tpp = &sym->s_type; to = NOTSPEC; while ((tp = *tpp) != NULL) { t = tp->t_tspec; /* * If this is the type of an old style function definition, * a better warning is printed in funcdef(). */ if (t == FUNC && !tp->t_proto && !(to == NOTSPEC && sym->s_osdef)) { if (sflag && hflag) /* function declaration is not a prototype */ warning(287); } if (to == FUNC) { if (t == FUNC || t == ARRAY) { /* function returns illegal type */ error(15); if (t == FUNC) { *tpp = incref(*tpp, PTR); } else { *tpp = incref((*tpp)->t_subt, PTR); } return; } else if (tp->t_const || tp->t_volatile) { if (sflag) { /* XXX oder better !tflag ? */ /* function cannot return const... */ warning(228); } } } if (to == ARRAY) { if (t == FUNC) { /* array of function is illegal */ error(16); *tpp = gettyp(INT); return; } else if (t == ARRAY && tp->t_dim == 0) { /* null dimension */ error(17); return; } else if (t == VOID) { /* illegal use of void */ error(18); *tpp = gettyp(INT); #if 0 /* errors are produced by length() */ } else if (incompl(tp)) { /* array of incomplete type */ if (sflag) { error(301); } else { warning(301); } #endif } } else if (to == NOTSPEC && t == VOID) { if (dcs->d_ctx == PARG) { if (sym->s_scl != ABSTRACT) { if (sym->s_name == unnamed) LERROR("chktyp()"); /* void param cannot have name: %s */ error(61, sym->s_name); *tpp = gettyp(INT); } } else if (dcs->d_ctx == ABSTRACT) { /* ok */ } else if (sym->s_scl != TYPEDEF) { /* void type for %s */ error(19, sym->s_name); *tpp = gettyp(INT); } } if (t == VOID && to != PTR) { if (tp->t_const || tp->t_volatile) { /* inappropriate qualifiers with "void" */ warning(69); tp->t_const = tp->t_volatile = 0; } } tpp = &tp->t_subt; to = t; } } /* * Process the declarator of a struct/union element. 
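 *
 * Editor's note on the bit-field checks below: only "int" and
 * "unsigned int" fields are portable, so e.g.
 *	struct s { char c:3; };
 * draws warning 273 (with -s) or 34 (with -p) unless BITFIELDTYPE is
 * in effect, and a non-integer type such as "float f:4;" is always
 * error 35.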
*/ sym_t * decl1str(sym_t *dsym) { type_t *tp; tspec_t t; int sz, len; int o = 0; /* Appease gcc */ scl_t sc; if ((sc = dsym->s_scl) != MOS && sc != MOU) LERROR("decl1str()"); if (dcs->d_rdcsym != NULL) { if ((sc = dcs->d_rdcsym->s_scl) != MOS && sc != MOU) /* should be ensured by storesym() */ LERROR("decl1str()"); if (dsym->s_styp == dcs->d_rdcsym->s_styp) { /* duplicate member name: %s */ error(33, dsym->s_name); rmsym(dcs->d_rdcsym); } } chktyp(dsym); t = (tp = dsym->s_type)->t_tspec; if (dsym->s_field) { /* * bit field * * only unsigned and signed int are portable bit-field types * (at least in ANSI C, in traditional C only unsigned int) */ if (t == CHAR || t == UCHAR || t == SCHAR || t == SHORT || t == USHORT || t == ENUM) { if (bitfieldtype_ok == 0) { if (sflag) { char buf[64]; /* * bit-field type '%s' invalid in * ANSI C */ warning(273, tyname(buf, sizeof(buf), tp)); } else if (pflag) { /* nonportable bit-field type */ warning(34); } } } else if (t == INT && dcs->d_smod == NOTSPEC) { if (pflag && bitfieldtype_ok == 0) { /* nonportable bit-field type */ warning(34); } } else if (t != INT && t != UINT) { /* * Non-integer types are always illegal for * bitfields, regardless of BITFIELDTYPE. * Integer types not dealt with above are * okay only if BITFIELDTYPE is in effect. */ if (bitfieldtype_ok == 0 || isityp(t) == 0) { /* illegal bit-field type */ error(35); sz = tp->t_flen; dsym->s_type = tp = duptyp(gettyp(t = INT)); if ((tp->t_flen = sz) > size(t)) tp->t_flen = size(t); } } if ((len = tp->t_flen) < 0 || len > size(t)) { /* illegal bit-field size */ error(36); tp->t_flen = size(t); } else if (len == 0 && dsym->s_name != unnamed) { /* zero size bit-field */ error(37); tp->t_flen = size(t); } if (dsym->s_scl == MOU) { /* illegal use of bit-field */ error(41); dsym->s_type->t_isfield = 0; dsym->s_field = 0; } } else if (t == FUNC) { /* function illegal in structure or union */ error(38); dsym->s_type = tp = incref(tp, t = PTR); } /* * bit-fields of length 0 are not warned about because length() * does not return the length of the bit-field but the length * of the type the bit-field is packed in (its ok) */ if ((sz = length(dsym->s_type, dsym->s_name)) == 0) { if (t == ARRAY && dsym->s_type->t_dim == 0) { /* illegal zero sized structure member: %s */ c99ism(39, dsym->s_name); } } if (dcs->d_ctx == MOU) { o = dcs->d_offset; dcs->d_offset = 0; } if (dsym->s_field) { align(getbound(tp), tp->t_flen); dsym->s_value.v_quad = (dcs->d_offset / size(t)) * size(t); tp->t_foffs = dcs->d_offset - (int)dsym->s_value.v_quad; dcs->d_offset += tp->t_flen; } else { align(getbound(tp), 0); dsym->s_value.v_quad = dcs->d_offset; dcs->d_offset += sz; } if (dcs->d_ctx == MOU) { if (o > dcs->d_offset) dcs->d_offset = o; } chkfdef(dsym, 0); /* * Clear the BITFIELDTYPE indicator after processing each * structure element. */ bitfieldtype_ok = 0; return (dsym); } /* * Aligns next structure element as required. * * al contains the required alignment, len the length of a bit-field. */ static void align(int al, int len) { int no; /* * The alignment of the current element becomes the alignment of * the struct/union if it is larger than the current alignment * of the struct/union. */ if (al > dcs->d_stralign) dcs->d_stralign = al; - no = (dcs->d_offset + (al - 1)) & ~(al - 1); + no = roundup2(dcs->d_offset, al); if (len == 0 || dcs->d_offset + len > no) dcs->d_offset = no; } /* * Remember the width of the field in its type structure. 
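 * (Editor's note: align() above now uses roundup2(dcs->d_offset, al),
 * which for the power-of-two alignments used here equals the old
 * (offset + (al - 1)) & ~(al - 1) expression; e.g. roundup2(13, 8) ==
 * 16 and roundup2(16, 8) == 16.)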
 */
sym_t *
bitfield(sym_t *dsym, int len)
{

	if (dsym == NULL) {
		dsym = getblk(sizeof (sym_t));
		dsym->s_name = unnamed;
		dsym->s_kind = FMOS;
		dsym->s_scl = MOS;
		dsym->s_type = gettyp(UINT);
		dsym->s_blklev = -1;
	}
	dsym->s_type = duptyp(dsym->s_type);
	dsym->s_type->t_isfield = 1;
	dsym->s_type->t_flen = len;
	dsym->s_field = 1;
	return (dsym);
}

/*
 * Collect information about a sequence of asterisks and qualifiers
 * in a list of type pqinf_t.
 * Qualifiers always refer to the left asterisk. The rightmost asterisk
 * will be at the top of the list.
 */
pqinf_t *
mergepq(pqinf_t *p1, pqinf_t *p2)
{
	pqinf_t	*p;

	if (p2->p_pcnt != 0) {
		/* left '*' at the end of the list */
		for (p = p2; p->p_nxt != NULL; p = p->p_nxt)
			continue;
		p->p_nxt = p1;
		return (p2);
	} else {
		if (p2->p_const) {
			if (p1->p_const) {
				/* duplicate %s */
				warning(10, "const");
			}
			p1->p_const = 1;
		}
		if (p2->p_volatile) {
			if (p1->p_volatile) {
				/* duplicate %s */
				warning(10, "volatile");
			}
			p1->p_volatile = 1;
		}
		free(p2);
		return (p1);
	}
}

/*
 * The following three functions extend the type of a declarator with
 * pointer, function and array types.
 *
 * The current type is the type built by deftyp() (dcs->d_type) and
 * pointer, function and array types already added for this
 * declarator. The new type extension is inserted between both.
 */
sym_t *
addptr(sym_t *decl, pqinf_t *pi)
{
	type_t	**tpp, *tp;
	pqinf_t	*npi;

	tpp = &decl->s_type;
	while (*tpp && *tpp != dcs->d_type)
		tpp = &(*tpp)->t_subt;
	if (*tpp == NULL)
		return decl;

	while (pi != NULL) {
		*tpp = tp = getblk(sizeof (type_t));
		tp->t_tspec = PTR;
		tp->t_const = pi->p_const;
		tp->t_volatile = pi->p_volatile;
		*(tpp = &tp->t_subt) = dcs->d_type;
		npi = pi->p_nxt;
		free(pi);
		pi = npi;
	}
	return (decl);
}

/*
 * If a dimension was specified, dim is 1, otherwise 0.
 * n is the specified dimension.
 */
sym_t *
addarray(sym_t *decl, int dim, int n)
{
	type_t	**tpp, *tp;

	tpp = &decl->s_type;
	while (*tpp && *tpp != dcs->d_type)
		tpp = &(*tpp)->t_subt;
	if (*tpp == NULL)
		return decl;

	*tpp = tp = getblk(sizeof (type_t));
	tp->t_tspec = ARRAY;
	tp->t_subt = dcs->d_type;
	tp->t_dim = n;

	if (n < 0) {
		/* negative array dimension */
		error(20, n);
		n = 0;
	} else if (n == 0 && dim) {
		/* zero array dimension */
		c99ism(322, dim);
	} else if (n == 0 && !dim) {
		/* is incomplete type */
		setcompl(tp, 1);
	}

	return (decl);
}

sym_t *
addfunc(sym_t *decl, sym_t *args)
{
	type_t	**tpp, *tp;

	if (dcs->d_proto) {
		if (tflag)
			/* function prototypes are illegal in traditional C */
			warning(270);
		args = nsfunc(decl, args);
	} else {
		osfunc(decl, args);
	}

	/*
	 * The symbols are removed from the symbol table by popdecl() after
	 * addfunc(). To be able to restore them if this is a function
	 * definition, a pointer to the list of all symbols is stored in
	 * dcs->d_nxt->d_fpsyms. Also a list of the arguments (concatenated
	 * by s_nxt) is stored in dcs->d_nxt->d_fargs.
	 * (dcs->d_nxt must be used because *dcs is the declaration stack
	 * element created for the list of params and is removed after
	 * addfunc())
	 */
	if (dcs->d_nxt->d_ctx == EXTERN &&
	    decl->s_type == dcs->d_nxt->d_type) {
		dcs->d_nxt->d_fpsyms = dcs->d_dlsyms;
		dcs->d_nxt->d_fargs = args;
	}

	tpp = &decl->s_type;
	while (*tpp && *tpp != dcs->d_nxt->d_type)
		tpp = &(*tpp)->t_subt;
	if (*tpp == NULL)
		return decl;

	*tpp = tp = getblk(sizeof (type_t));
	tp->t_tspec = FUNC;
	tp->t_subt = dcs->d_nxt->d_type;
	if ((tp->t_proto = dcs->d_proto) != 0)
		tp->t_args = args;
	tp->t_vararg = dcs->d_vararg;

	return (decl);
}

/*
 * Called for new style function declarations.
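 *
 * Editor's examples: "int f(void)" reaches nsfunc() with a single
 * VOID argument and the list is reduced to NULL; "int f(void, int)"
 * or "int f(int, void)" draws error 60 ('"void" must be sole
 * parameter').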
 */
/* ARGSUSED */
static sym_t *
nsfunc(sym_t *decl, sym_t *args)
{
	sym_t	*arg, *sym;
	scl_t	sc;
	int	n;

	/*
	 * Declarations of structs/unions/enums in param lists are legal,
	 * but senseless.
	 */
	for (sym = dcs->d_dlsyms; sym != NULL; sym = sym->s_dlnxt) {
		sc = sym->s_scl;
		if (sc == STRTAG || sc == UNIONTAG || sc == ENUMTAG) {
			/* dubious tag declaration: %s %s */
			warning(85, scltoa(sc), sym->s_name);
		}
	}

	n = 1;
	for (arg = args; arg != NULL; arg = arg->s_nxt) {
		if (arg->s_type->t_tspec == VOID) {
			if (n > 1 || arg->s_nxt != NULL) {
				/* "void" must be sole parameter */
				error(60);
				arg->s_type = gettyp(INT);
			}
		}
		n++;
	}

	/* return NULL if first param is VOID */
	return (args != NULL && args->s_type->t_tspec != VOID ? args : NULL);
}

/*
 * Called for old style function declarations.
 */
static void
osfunc(sym_t *decl, sym_t *args)
{

	/*
	 * Remember the list of params only if this really seems to be
	 * a function definition.
	 */
	if (dcs->d_nxt->d_ctx == EXTERN &&
	    decl->s_type == dcs->d_nxt->d_type) {
		/*
		 * We assume that this becomes a function definition. If
		 * we are wrong, it's corrected in chkfdef().
		 */
		if (args != NULL) {
			decl->s_osdef = 1;
			decl->s_args = args;
		}
	} else {
		if (args != NULL)
			/* function prototype parameters must have types */
			warning(62);
	}
}

/*
 * Lists of identifiers in function declarations are allowed only if
 * it is also a function definition. If this is not the case, print an
 * error message.
 */
void
chkfdef(sym_t *sym, int msg)
{

	if (sym->s_osdef) {
		if (msg) {
			/* incomplete or misplaced function definition */
			error(22);
		}
		sym->s_osdef = 0;
		sym->s_args = NULL;
	}
}

/*
 * Process the name in a declarator.
 * If the symbol already exists, a new one is created.
 * The symbol becomes one of the storage classes EXTERN, STATIC, AUTO or
 * TYPEDEF.
 * s_def and s_reg are valid after dname().
 */
sym_t *
dname(sym_t *sym)
{
	scl_t	sc = NOSCL;

	if (sym->s_scl == NOSCL) {
		dcs->d_rdcsym = NULL;
	} else if (sym->s_defarg) {
		sym->s_defarg = 0;
		dcs->d_rdcsym = NULL;
	} else {
		dcs->d_rdcsym = sym;
		sym = pushdown(sym);
	}

	switch (dcs->d_ctx) {
	case MOS:
	case MOU:
		/* set the parent */
		sym->s_styp = dcs->d_tagtyp->t_str;
		sym->s_def = DEF;
		sym->s_value.v_tspec = INT;
		sc = dcs->d_ctx;
		break;
	case EXTERN:
		/*
		 * static and external symbols without "extern" are
		 * considered to be tentatively defined, external
		 * symbols with "extern" are declared, and typedef names
		 * are defined. Tentatively defined and declared symbols
		 * may become defined if an initializer is present or
		 * this is a function definition.
		 */
		if ((sc = dcs->d_scl) == NOSCL) {
			sc = EXTERN;
			sym->s_def = TDEF;
		} else if (sc == STATIC) {
			sym->s_def = TDEF;
		} else if (sc == TYPEDEF) {
			sym->s_def = DEF;
		} else if (sc == EXTERN) {
			sym->s_def = DECL;
		} else {
			LERROR("dname()");
		}
		break;
	case PARG:
		sym->s_arg = 1;
		/* FALLTHROUGH */
	case ARG:
		if ((sc = dcs->d_scl) == NOSCL) {
			sc = AUTO;
		} else if (sc == REG) {
			sym->s_reg = 1;
			sc = AUTO;
		} else {
			LERROR("dname()");
		}
		sym->s_def = DEF;
		break;
	case AUTO:
		if ((sc = dcs->d_scl) == NOSCL) {
			/*
			 * XXX somewhat ugly because we don't know whether
			 * this is AUTO or EXTERN (functions). If we are
			 * wrong it must be corrected in decl1loc(), where
			 * we have the necessary type information.
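			 * (Editor's example: inside a block, "int i;"
			 * is AUTO but "int f();" declares a function
			 * and is effectively EXTERN; only the type,
			 * known later in decl1loc(), tells them
			 * apart.)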
*/ sc = AUTO; sym->s_def = DEF; } else if (sc == AUTO || sc == STATIC || sc == TYPEDEF) { sym->s_def = DEF; } else if (sc == REG) { sym->s_reg = 1; sc = AUTO; sym->s_def = DEF; } else if (sc == EXTERN) { sym->s_def = DECL; } else { LERROR("dname()"); } break; default: LERROR("dname()"); } sym->s_scl = sc; sym->s_type = dcs->d_type; dcs->d_fpsyms = NULL; return (sym); } /* * Process a name in the list of formal params in an old style function * definition. */ sym_t * iname(sym_t *sym) { if (sym->s_scl != NOSCL) { if (blklev == sym->s_blklev) { /* redeclaration of formal parameter %s */ error(21, sym->s_name); if (!sym->s_defarg) LERROR("iname()"); } sym = pushdown(sym); } sym->s_type = gettyp(INT); sym->s_scl = AUTO; sym->s_def = DEF; sym->s_defarg = sym->s_arg = 1; return (sym); } /* * Create the type of a tag. * * tag points to the symbol table entry of the tag * kind is the kind of the tag (STRUCT/UNION/ENUM) * decl is 1 if the type of the tag will be completed in this declaration * (the following token is T_LBRACE) * semi is 1 if the following token is T_SEMI */ type_t * mktag(sym_t *tag, tspec_t kind, int decl, int semi) { scl_t scl = NOSCL; type_t *tp; if (kind == STRUCT) { scl = STRTAG; } else if (kind == UNION) { scl = UNIONTAG; } else if (kind == ENUM) { scl = ENUMTAG; } else { LERROR("mktag()"); } if (tag != NULL) { if (tag->s_scl != NOSCL) { tag = newtag(tag, scl, decl, semi); } else { /* a new tag, no empty declaration */ dcs->d_nxt->d_nedecl = 1; if (scl == ENUMTAG && !decl) { if (!tflag && (sflag || pflag)) /* forward reference to enum type */ warning(42); } } if (tag->s_scl == NOSCL) { tag->s_scl = scl; tag->s_type = tp = getblk(sizeof (type_t)); } else { tp = tag->s_type; } } else { tag = getblk(sizeof (sym_t)); tag->s_name = unnamed; UNIQUE_CURR_POS(tag->s_dpos); tag->s_kind = FTAG; tag->s_scl = scl; tag->s_blklev = -1; tag->s_type = tp = getblk(sizeof (type_t)); dcs->d_nxt->d_nedecl = 1; } if (tp->t_tspec == NOTSPEC) { tp->t_tspec = kind; if (kind != ENUM) { tp->t_str = getblk(sizeof (str_t)); tp->t_str->align = CHAR_BIT; tp->t_str->stag = tag; } else { tp->t_isenum = 1; tp->t_enum = getblk(sizeof (enum_t)); tp->t_enum->etag = tag; } /* is incomplete type */ setcompl(tp, 1); } return (tp); } /* * Checks all possible cases of tag redeclarations. * decl is 1 if T_LBRACE follows * semi is 1 if T_SEMI follows */ static sym_t * newtag(sym_t *tag, scl_t scl, int decl, int semi) { if (tag->s_blklev < blklev) { if (semi) { /* "struct a;" */ if (!tflag) { if (!sflag) /* decl. introduces new type ... */ warning(44, scltoa(scl), tag->s_name); tag = pushdown(tag); } else if (tag->s_scl != scl) { /* base type is really "%s %s" */ warning(45, scltoa(tag->s_scl), tag->s_name); } dcs->d_nxt->d_nedecl = 1; } else if (decl) { /* "struct a { ... 
} " */ if (hflag) /* redefinition hides earlier one: %s */ warning(43, tag->s_name); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } else if (tag->s_scl != scl) { /* base type is really "%s %s" */ warning(45, scltoa(tag->s_scl), tag->s_name); /* declaration introduces new type in ANSI C: %s %s */ if (!sflag) warning(44, scltoa(scl), tag->s_name); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } } else { if (tag->s_scl != scl) { /* (%s) tag redeclared */ error(46, scltoa(tag->s_scl)); prevdecl(-1, tag); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } else if (decl && !incompl(tag->s_type)) { /* (%s) tag redeclared */ error(46, scltoa(tag->s_scl)); prevdecl(-1, tag); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } else if (semi || decl) { dcs->d_nxt->d_nedecl = 1; } } return (tag); } const char * scltoa(scl_t sc) { const char *s; switch (sc) { case EXTERN: s = "extern"; break; case STATIC: s = "static"; break; case AUTO: s = "auto"; break; case REG: s = "register"; break; case TYPEDEF: s = "typedef"; break; case STRTAG: s = "struct"; break; case UNIONTAG: s = "union"; break; case ENUMTAG: s = "enum"; break; default: LERROR("tagttoa()"); } return (s); } /* * Completes the type of a tag in a struct/union/enum declaration. * tp points to the type of the, tag, fmem to the list of members/enums. */ type_t * compltag(type_t *tp, sym_t *fmem) { tspec_t t; str_t *sp; int n; sym_t *mem; /* from now a complete type */ setcompl(tp, 0); if ((t = tp->t_tspec) != ENUM) { align(dcs->d_stralign, 0); sp = tp->t_str; sp->align = dcs->d_stralign; sp->size = dcs->d_offset; sp->memb = fmem; if (sp->size == 0) { /* zero sized %s */ (void)c99ism(47, ttab[t].tt_name); } else { n = 0; for (mem = fmem; mem != NULL; mem = mem->s_nxt) { if (mem->s_name != unnamed) n++; } if (n == 0) { /* %s has no named members */ warning(65, t == STRUCT ? "structure" : "union"); } } } else { tp->t_enum->elem = fmem; } return (tp); } /* * Processes the name of an enumerator in en enum declaration. * * sym points to the enumerator * val is the value of the enumerator * impl is 1 if the value of the enumerator was not explicit specified. */ sym_t * ename(sym_t *sym, int val, int impl) { if (sym->s_scl) { if (sym->s_blklev == blklev) { /* no hflag, because this is illegal!!! */ if (sym->s_arg) { /* enumeration constant hides parameter: %s */ warning(57, sym->s_name); } else { /* redeclaration of %s */ error(27, sym->s_name); /* * inside blocks it should not too complicated * to find the position of the previous * declaration */ if (blklev == 0) prevdecl(-1, sym); } } else { if (hflag) /* redefinition hides earlier one: %s */ warning(43, sym->s_name); } sym = pushdown(sym); } sym->s_scl = ENUMCON; sym->s_type = dcs->d_tagtyp; sym->s_value.v_tspec = INT; sym->s_value.v_quad = val; if (impl && val - 1 == INT_MAX) { /* overflow in enumeration values: %s */ warning(48, sym->s_name); } enumval = val + 1; return (sym); } /* * Process a single external declarator. */ void decl1ext(sym_t *dsym, int initflg) { int warn, rval, redec; sym_t *rdsym; chkfdef(dsym, 1); chktyp(dsym); if (initflg && !(initerr = chkinit(dsym))) dsym->s_def = DEF; /* * Declarations of functions are marked as "tentative" in dname(). * This is wrong because there are no tentative function * definitions. 
*/ if (dsym->s_type->t_tspec == FUNC && dsym->s_def == TDEF) dsym->s_def = DECL; if (dcs->d_inline) { if (dsym->s_type->t_tspec == FUNC) { dsym->s_inline = 1; } else { /* variable declared inline: %s */ warning(268, dsym->s_name); } } /* Write the declaration into the output file */ if (plibflg && llibflg && dsym->s_type->t_tspec == FUNC && dsym->s_type->t_proto) { /* * With both LINTLIBRARY and PROTOLIB the prototype is * written as a function definition to the output file. */ rval = dsym->s_type->t_subt->t_tspec != VOID; outfdef(dsym, &dsym->s_dpos, rval, 0, NULL); } else { outsym(dsym, dsym->s_scl, dsym->s_def); } if ((rdsym = dcs->d_rdcsym) != NULL) { /* * If the old symbol stems from an old style function definition * we have remembered the params in rdsym->s_args and compare * them with the params of the prototype. */ if (rdsym->s_osdef && dsym->s_type->t_proto) { redec = chkosdef(rdsym, dsym); } else { redec = 0; } if (!redec && !isredec(dsym, (warn = 0, &warn))) { if (warn) { /* redeclaration of %s */ (*(sflag ? error : warning))(27, dsym->s_name); prevdecl(-1, rdsym); } /* * Take over the remembered params if the new symbol * is not a prototype. */ if (rdsym->s_osdef && !dsym->s_type->t_proto) { dsym->s_osdef = rdsym->s_osdef; dsym->s_args = rdsym->s_args; STRUCT_ASSIGN(dsym->s_dpos, rdsym->s_dpos); } /* * Remember the position of the declaration if the * old symbol was a prototype and the new is not. * Also remember the position if the old symbol * was defined and the new is not. */ if (rdsym->s_type->t_proto && !dsym->s_type->t_proto) { STRUCT_ASSIGN(dsym->s_dpos, rdsym->s_dpos); } else if (rdsym->s_def == DEF && dsym->s_def != DEF) { STRUCT_ASSIGN(dsym->s_dpos, rdsym->s_dpos); } /* * Copy information about usage of the name into * the new symbol. */ cpuinfo(dsym, rdsym); /* Once a name is defined, it remains defined. */ if (rdsym->s_def == DEF) dsym->s_def = DEF; /* once a function is inline, it remains inline */ if (rdsym->s_inline) dsym->s_inline = 1; compltyp(dsym, rdsym); } rmsym(rdsym); } if (dsym->s_scl == TYPEDEF) { dsym->s_type = duptyp(dsym->s_type); dsym->s_type->t_typedef = 1; settdsym(dsym->s_type, dsym); } } /* * Copies information about usage into a new symbol table entry of * the same symbol. */ void cpuinfo(sym_t *sym, sym_t *rdsym) { sym->s_spos = rdsym->s_spos; sym->s_upos = rdsym->s_upos; sym->s_set = rdsym->s_set; sym->s_used = rdsym->s_used; } /* * Prints an error and returns 1 if a symbol is redeclared/redefined. * Otherwise returns 0 and, in some cases of minor problems, prints * a warning.
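 *
 * For example:
 *
 *	int n = 1; int n = 2;		redefinition of n (28)
 *	double d; int d;		redeclaration of d (27)
 *	extern int m; static int m;	previously declared extern,
 *					becomes static (29)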
*/ int isredec(sym_t *dsym, int *warn) { sym_t *rsym; if ((rsym = dcs->d_rdcsym)->s_scl == ENUMCON) { /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return (1); } if (rsym->s_scl == TYPEDEF) { /* typedef redeclared: %s */ error(89, dsym->s_name); prevdecl(-1, rsym); return (1); } if (dsym->s_scl == TYPEDEF) { /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return (1); } if (rsym->s_def == DEF && dsym->s_def == DEF) { /* redefinition of %s */ error(28, dsym->s_name); prevdecl(-1, rsym); return(1); } if (!eqtype(rsym->s_type, dsym->s_type, 0, 0, warn)) { /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return(1); } if (rsym->s_scl == EXTERN && dsym->s_scl == EXTERN) return(0); if (rsym->s_scl == STATIC && dsym->s_scl == STATIC) return(0); if (rsym->s_scl == STATIC && dsym->s_def == DECL) return(0); if (rsym->s_scl == EXTERN && rsym->s_def == DEF) { /* * All cases except "int a = 1; static int a;" are caught * above with or without a warning */ /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return(1); } if (rsym->s_scl == EXTERN) { /* previously declared extern, becomes static: %s */ warning(29, dsym->s_name); prevdecl(-1, rsym); return(0); } /* * Now it's one of: * "static a; int a;", "static a; int a = 1;", "static a = 1; int a;" */ /* redeclaration of %s; ANSI C requires "static" */ if (sflag) { warning(30, dsym->s_name); prevdecl(-1, rsym); } dsym->s_scl = STATIC; return (0); } /* * Checks if two types are compatible. Returns 0 if not, otherwise 1. * * ignqual ignore qualifiers of type; used for function params * promot promote left type; used for comparison of params of * old style function definitions with params of prototypes. * *warn set to 1 if an old style function declaration is not * compatible with a prototype */ int eqtype(type_t *tp1, type_t *tp2, int ignqual, int promot, int *warn) { tspec_t t; while (tp1 != NULL && tp2 != NULL) { t = tp1->t_tspec; if (promot) { if (t == FLOAT) { t = DOUBLE; } else if (t == CHAR || t == SCHAR) { t = INT; } else if (t == UCHAR) { t = tflag ? UINT : INT; } else if (t == SHORT) { t = INT; } else if (t == USHORT) { /* CONSTCOND */ t = INT_MAX < USHRT_MAX || tflag ? UINT : INT; } } if (t != tp2->t_tspec) return (0); if (tp1->t_const != tp2->t_const && !ignqual && !tflag) return (0); if (tp1->t_volatile != tp2->t_volatile && !ignqual && !tflag) return (0); if (t == STRUCT || t == UNION) return (tp1->t_str == tp2->t_str); if (t == ARRAY && tp1->t_dim != tp2->t_dim) { if (tp1->t_dim != 0 && tp2->t_dim != 0) return (0); } /* don't check prototypes in traditional C */ if (t == FUNC && !tflag) { if (tp1->t_proto && tp2->t_proto) { if (!eqargs(tp1, tp2, warn)) return (0); } else if (tp1->t_proto) { if (!mnoarg(tp1, warn)) return (0); } else if (tp2->t_proto) { if (!mnoarg(tp2, warn)) return (0); } } tp1 = tp1->t_subt; tp2 = tp2->t_subt; ignqual = promot = 0; } return (tp1 == tp2); } /* * Compares the parameter types of two prototypes. */ static int eqargs(type_t *tp1, type_t *tp2, int *warn) { sym_t *a1, *a2; if (tp1->t_vararg != tp2->t_vararg) return (0); a1 = tp1->t_args; a2 = tp2->t_args; while (a1 != NULL && a2 != NULL) { if (eqtype(a1->s_type, a2->s_type, 1, 0, warn) == 0) return (0); a1 = a1->s_nxt; a2 = a2->s_nxt; } return (a1 == a2); } /* * mnoarg() (matches functions with no argument type information) * returns 1 if all parameters of a prototype are compatible with * an old style function declaration. * This is the case if the following conditions are met: * 1.
the prototype must have a fixed number of parameters * 2. no parameter is of type float * 3. no parameter is converted to another type if integer promotion * is applied to it */ static int mnoarg(type_t *tp, int *warn) { sym_t *arg; tspec_t t; if (tp->t_vararg) { if (warn != NULL) *warn = 1; } for (arg = tp->t_args; arg != NULL; arg = arg->s_nxt) { if ((t = arg->s_type->t_tspec) == FLOAT || t == CHAR || t == SCHAR || t == UCHAR || t == SHORT || t == USHORT) { if (warn != NULL) *warn = 1; } } return (1); } /* * Compares a prototype declaration with the remembered arguments of * a previous old style function definition. */ static int chkosdef(sym_t *rdsym, sym_t *dsym) { sym_t *args, *pargs, *arg, *parg; int narg, nparg, n; int warn, msg; args = rdsym->s_args; pargs = dsym->s_type->t_args; msg = 0; narg = nparg = 0; for (arg = args; arg != NULL; arg = arg->s_nxt) narg++; for (parg = pargs; parg != NULL; parg = parg->s_nxt) nparg++; if (narg != nparg) { /* prototype does not match old-style definition */ error(63); msg = 1; goto end; } arg = args; parg = pargs; n = 1; while (narg--) { warn = 0; /* * If it does not match due to promotion and sflag is * not set we print only a warning. */ if (!eqtype(arg->s_type, parg->s_type, 1, 1, &warn) || warn) { /* prototype does not match old-style def., arg #%d */ error(299, n); msg = 1; } arg = arg->s_nxt; parg = parg->s_nxt; n++; } end: if (msg) /* old style definition */ prevdecl(300, rdsym); return (msg); } /* * Completes a type by copying the dimension and prototype information * from a second compatible type. * * The following lines are legal: * "typedef a[]; a b; a b[10]; a c; a c[20];" * "typedef ft(); ft f; f(int); ft g; g(long);" * This means that, if a type is completed, the type structure must * be duplicated. */ void compltyp(sym_t *dsym, sym_t *ssym) { type_t **dstp, *src; type_t *dst; dstp = &dsym->s_type; src = ssym->s_type; while ((dst = *dstp) != NULL) { if (src == NULL || dst->t_tspec != src->t_tspec) LERROR("compltyp()"); if (dst->t_tspec == ARRAY) { if (dst->t_dim == 0 && src->t_dim != 0) { *dstp = dst = duptyp(dst); dst->t_dim = src->t_dim; /* now a complete type */ setcompl(dst, 0); } } else if (dst->t_tspec == FUNC) { if (!dst->t_proto && src->t_proto) { *dstp = dst = duptyp(dst); dst->t_proto = 1; dst->t_args = src->t_args; } } dstp = &dst->t_subt; src = src->t_subt; } } /* * Completes the declaration of a single argument. */ sym_t * decl1arg(sym_t *sym, int initflg) { tspec_t t; chkfdef(sym, 1); chktyp(sym); if (dcs->d_rdcsym != NULL && dcs->d_rdcsym->s_blklev == blklev) { /* redeclaration of formal parameter %s */ error(237, sym->s_name); rmsym(dcs->d_rdcsym); sym->s_arg = 1; } if (!sym->s_arg) { /* declared argument %s is missing */ error(53, sym->s_name); sym->s_arg = 1; } if (initflg) { /* cannot initialize parameter: %s */ error(52, sym->s_name); initerr = 1; } if ((t = sym->s_type->t_tspec) == ARRAY) { sym->s_type = incref(sym->s_type->t_subt, PTR); } else if (t == FUNC) { if (tflag) /* a function is declared as an argument: %s */ warning(50, sym->s_name); sym->s_type = incref(sym->s_type, PTR); } else if (t == FLOAT) { if (tflag) sym->s_type = gettyp(DOUBLE); } if (dcs->d_inline) /* argument declared inline: %s */ warning(269, sym->s_name); /* * Arguments must have complete types. length() prints the needed * error messages (null dimension is impossible because arrays are * converted to pointers).
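 *
 * For example, in the old style definition
 *
 *	void f(a, g, x) int a[10]; int g(); float x; { }
 *
 * a has been adjusted above to int *, g to int (*)(), and, in
 * traditional C, x to double.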
*/ if (sym->s_type->t_tspec != VOID) (void)length(sym->s_type, sym->s_name); setsflg(sym); return (sym); } /* * Does some checks for lint directives which apply to functions. * Processes arguments in old style function definitions which default * to int. * Checks compatibility of old style function definition with previous * prototype. */ void cluparg(void) { sym_t *args, *arg, *pargs, *parg; int narg, nparg, n, msg; tspec_t t; args = funcsym->s_args; pargs = funcsym->s_type->t_args; /* check for illegal combinations of lint directives */ if (prflstrg != -1 && scflstrg != -1) { /* can't be used together: ** PRINTFLIKE ** ** SCANFLIKE ** */ warning(289); prflstrg = scflstrg = -1; } if (nvararg != -1 && (prflstrg != -1 || scflstrg != -1)) { /* dubious use of ** VARARGS ** with ** %s ** */ warning(288, prflstrg != -1 ? "PRINTFLIKE" : "SCANFLIKE"); nvararg = -1; } /* * check if the argument of a lint directive is compatible with the * number of arguments. */ narg = 0; for (arg = dcs->d_fargs; arg != NULL; arg = arg->s_nxt) narg++; if (nargusg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "ARGSUSED"); nargusg = 0; } if (nvararg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "VARARGS"); nvararg = 0; } if (prflstrg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "PRINTFLIKE"); prflstrg = -1; } else if (prflstrg == 0) { prflstrg = -1; } if (scflstrg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "SCANFLIKE"); scflstrg = -1; } else if (scflstrg == 0) { scflstrg = -1; } if (prflstrg != -1 || scflstrg != -1) { narg = prflstrg != -1 ? prflstrg : scflstrg; arg = dcs->d_fargs; for (n = 1; n < narg; n++) arg = arg->s_nxt; if (arg->s_type->t_tspec != PTR || ((t = arg->s_type->t_subt->t_tspec) != CHAR && t != UCHAR && t != SCHAR)) { /* arg. %d must be 'char *' for PRINTFLIKE/SCANFLIKE */ warning(293, narg); prflstrg = scflstrg = -1; } } /* * print a warning for each argument of an old style function * definition which defaults to int */ for (arg = args; arg != NULL; arg = arg->s_nxt) { if (arg->s_defarg) { /* argument type defaults to int: %s */ warning(32, arg->s_name); arg->s_defarg = 0; setsflg(arg); } } /* * If this is an old style function definition and a prototype * exists, compare the types of arguments. */ if (funcsym->s_osdef && funcsym->s_type->t_proto) { /* * If the number of arguments does not match, we need not * continue. */ narg = nparg = 0; msg = 0; for (parg = pargs; parg != NULL; parg = parg->s_nxt) nparg++; for (arg = args; arg != NULL; arg = arg->s_nxt) narg++; if (narg != nparg) { /* parameter mismatch: %d declared, %d defined */ error(51, nparg, narg); msg = 1; } else { parg = pargs; arg = args; while (narg--) { msg |= chkptdecl(arg, parg); parg = parg->s_nxt; arg = arg->s_nxt; } } if (msg) /* prototype declaration */ prevdecl(285, dcs->d_rdcsym); /* from now on the prototype is valid */ funcsym->s_osdef = 0; funcsym->s_args = NULL; } } /* * Checks compatibility of an old style function definition with a previous * prototype declaration. * Returns 1 if the position of the previous declaration should be reported.
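 *
 * For example:
 *
 *	int f(short n);
 *	int f(n) short n; { return n; }
 *
 * is a mismatch caused purely by argument promotion: in the old style
 * definition n arrives as int, so a matching prototype would have to
 * say int as well.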
*/ static int chkptdecl(sym_t *arg, sym_t *parg) { type_t *tp, *ptp; int warn, msg; tp = arg->s_type; ptp = parg->s_type; msg = 0; warn = 0; if (!eqtype(tp, ptp, 1, 1, &warn)) { if (eqtype(tp, ptp, 1, 0, &warn)) { /* type does not match prototype: %s */ msg = gnuism(58, arg->s_name); } else { /* type does not match prototype: %s */ error(58, arg->s_name); msg = 1; } } else if (warn) { /* type does not match prototype: %s */ (*(sflag ? error : warning))(58, arg->s_name); msg = 1; } return (msg); } /* * Completes a single local declaration/definition. */ void decl1loc(sym_t *dsym, int initflg) { /* Correct a mistake made in dname(). */ if (dsym->s_type->t_tspec == FUNC) { dsym->s_def = DECL; if (dcs->d_scl == NOSCL) dsym->s_scl = EXTERN; } if (dsym->s_type->t_tspec == FUNC) { if (dsym->s_scl == STATIC) { /* dubious static function at block level: %s */ warning(93, dsym->s_name); dsym->s_scl = EXTERN; } else if (dsym->s_scl != EXTERN && dsym->s_scl != TYPEDEF) { /* function has illegal storage class: %s */ error(94, dsym->s_name); dsym->s_scl = EXTERN; } } /* * functions may be declared inline at local scope, although * this has no effect for a later definition of the same * function. * XXX it should have an effect if tflag is set. This would * also be the way gcc behaves. */ if (dcs->d_inline) { if (dsym->s_type->t_tspec == FUNC) { dsym->s_inline = 1; } else { /* variable declared inline: %s */ warning(268, dsym->s_name); } } chkfdef(dsym, 1); chktyp(dsym); if (dcs->d_rdcsym != NULL && dsym->s_scl == EXTERN) ledecl(dsym); if (dsym->s_scl == EXTERN) { /* * XXX if the static variable at level 0 is only * defined later on, we are fooled here. */ if (dsym->s_xsym == NULL) { outsym(dsym, EXTERN, dsym->s_def); } else { outsym(dsym, dsym->s_xsym->s_scl, dsym->s_def); } } if (dcs->d_rdcsym != NULL) { if (dcs->d_rdcsym->s_blklev == 0) { switch (dsym->s_scl) { case AUTO: /* automatic hides external declaration: %s */ if (hflag) warning(86, dsym->s_name); break; case STATIC: /* static hides external declaration: %s */ if (hflag) warning(87, dsym->s_name); break; case TYPEDEF: /* typedef hides external declaration: %s */ if (hflag) warning(88, dsym->s_name); break; case EXTERN: /* * Warnings and errors are printed in ledecl() */ break; default: LERROR("decl1loc()"); } } else if (dcs->d_rdcsym->s_blklev == blklev) { /* no hflag, because it's illegal! */ if (dcs->d_rdcsym->s_arg) { /* * if !tflag, a "redeclaration of %s" error * is produced below */ if (tflag) { if (hflag) /* decl. hides parameter: %s */ warning(91, dsym->s_name); rmsym(dcs->d_rdcsym); } } } else if (dcs->d_rdcsym->s_blklev < blklev) { if (hflag) /* declaration hides earlier one: %s */ warning(95, dsym->s_name); } if (dcs->d_rdcsym->s_blklev == blklev) { /* redeclaration of %s */ error(27, dsym->s_name); rmsym(dcs->d_rdcsym); } } if (initflg && !(initerr = chkinit(dsym))) { dsym->s_def = DEF; setsflg(dsym); } if (dsym->s_scl == TYPEDEF) { dsym->s_type = duptyp(dsym->s_type); dsym->s_type->t_typedef = 1; settdsym(dsym->s_type, dsym); } /* * Before we can check the size we must wait for an initialisation * which may follow. */ } /* * Processes (re)declarations of external symbols inside blocks.
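 *
 * For example, in
 *
 *	static int counter;
 *	void f(void) { extern int counter; counter++; }
 *
 * the block level "extern" declaration must be checked against the
 * file scope symbol it refers to.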
*/ static void ledecl(sym_t *dsym) { int eqt, warn; sym_t *esym; /* look for a symbol with the same name */ esym = dcs->d_rdcsym; while (esym != NULL && esym->s_blklev != 0) { while ((esym = esym->s_link) != NULL) { if (esym->s_kind != FVFT) continue; if (strcmp(dsym->s_name, esym->s_name) == 0) break; } } if (esym == NULL) return; if (esym->s_scl != EXTERN && esym->s_scl != STATIC) { /* gcc accepts this without a warning, pcc prints an error. */ /* redeclaration of %s */ warning(27, dsym->s_name); prevdecl(-1, esym); return; } warn = 0; eqt = eqtype(esym->s_type, dsym->s_type, 0, 0, &warn); if (!eqt || warn) { if (esym->s_scl == EXTERN) { /* inconsistent redeclaration of extern: %s */ warning(90, dsym->s_name); prevdecl(-1, esym); } else { /* inconsistent redeclaration of static: %s */ warning(92, dsym->s_name); prevdecl(-1, esym); } } if (eqt) { /* * Remember the external symbol so we can update usage * information at the end of the block. */ dsym->s_xsym = esym; } } /* * Print an error or a warning if the symbol can't be initialized due * to type/storage class. Return value is 1 if an error has been * detected. */ static int chkinit(sym_t *sym) { int err; err = 0; if (sym->s_type->t_tspec == FUNC) { /* cannot initialize function: %s */ error(24, sym->s_name); err = 1; } else if (sym->s_scl == TYPEDEF) { /* cannot initialize typedef: %s */ error(25, sym->s_name); err = 1; } else if (sym->s_scl == EXTERN && sym->s_def == DECL) { /* cannot initialize "extern" declaration: %s */ if (dcs->d_ctx == EXTERN) { warning(26, sym->s_name); } else { error(26, sym->s_name); err = 1; } } return (err); } /* * Create a symbol for an abstract declaration. */ sym_t * aname(void) { sym_t *sym; if (dcs->d_ctx != ABSTRACT && dcs->d_ctx != PARG) LERROR("aname()"); sym = getblk(sizeof (sym_t)); sym->s_name = unnamed; sym->s_def = DEF; sym->s_scl = ABSTRACT; sym->s_blklev = -1; if (dcs->d_ctx == PARG) sym->s_arg = 1; sym->s_type = dcs->d_type; dcs->d_rdcsym = NULL; dcs->d_vararg = 0; return (sym); } /* * Removes anything that does not belong at global level. */ void globclup(void) { while (dcs->d_nxt != NULL) popdecl(); cleanup(); blklev = 0; mblklev = 0; /* * remove all information about pending lint directives without * warnings. */ glclup(1); } /* * Process an abstract type declaration. */ sym_t * decl1abs(sym_t *sym) { chkfdef(sym, 1); chktyp(sym); return (sym); } /* * Checks size after declarations of variables and their initialisation. */ void chksz(sym_t *dsym) { /* * check the size only for symbols which are defined, are not * functions and are not typedef names */ if (dsym->s_def != DEF) return; if (dsym->s_scl == TYPEDEF) return; if (dsym->s_type->t_tspec == FUNC) return; if (length(dsym->s_type, dsym->s_name) == 0 && dsym->s_type->t_tspec == ARRAY && dsym->s_type->t_dim == 0) { /* empty array declaration: %s */ if (tflag) { warning(190, dsym->s_name); } else { error(190, dsym->s_name); } } } /* * Mark an object as set if it is not already */ void setsflg(sym_t *sym) { if (!sym->s_set) { sym->s_set = 1; UNIQUE_CURR_POS(sym->s_spos); } } /* * Mark an object as used if it is not already */ void setuflg(sym_t *sym, int fcall, int szof) { if (!sym->s_used) { sym->s_used = 1; UNIQUE_CURR_POS(sym->s_upos); } /* * for function calls another record is written * * XXX Should symbols used in sizeof() be treated as used or not? * Probably not, because there is no sense in declaring an * external variable only to get its size.
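 *
 * For example, in
 *
 *	extern int obj;
 *	size_t nbytes = sizeof(obj);
 *
 * the sizeof() operand is the only reference to obj.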
*/ if (!fcall && !szof && sym->s_kind == FVFT && sym->s_scl == EXTERN) outusg(sym); } /* * Prints warnings for a list of variables and labels (concatenated * with s_dlnxt) if these are not used or only set. */ void chkusage(dinfo_t *di) { sym_t *sym; int mknowarn; /* for these warnings LINTED has no effect */ mknowarn = nowarn; nowarn = 0; for (sym = di->d_dlsyms; sym != NULL; sym = sym->s_dlnxt) chkusg1(di->d_asm, sym); nowarn = mknowarn; } /* * Prints a warning for a single variable or label if it is not used or * only set. */ void chkusg1(int novar, sym_t *sym) { pos_t cpos; if (sym->s_blklev == -1) return; STRUCT_ASSIGN(cpos, curr_pos); if (sym->s_kind == FVFT) { if (sym->s_arg) { chkausg(novar, sym); } else { chkvusg(novar, sym); } } else if (sym->s_kind == FLAB) { chklusg(sym); } else if (sym->s_kind == FTAG) { chktusg(sym); } STRUCT_ASSIGN(curr_pos, cpos); } static void chkausg(int novar, sym_t *arg) { if (!arg->s_set) LERROR("chkausg()"); if (novar) return; if (!arg->s_used && vflag) { STRUCT_ASSIGN(curr_pos, arg->s_dpos); /* argument %s unused in function %s */ warning(231, arg->s_name, funcsym->s_name); } } static void chkvusg(int novar, sym_t *sym) { scl_t sc; sym_t *xsym; if (blklev == 0 || sym->s_blklev == 0) LERROR("chkvusg()"); /* errors in expressions easily cause lots of these warnings */ if (nerr != 0) return; /* * XXX Only variables are checked, although types should * probably also be checked */ if ((sc = sym->s_scl) != EXTERN && sc != STATIC && sc != AUTO && sc != REG) { return; } if (novar) return; if (sc == EXTERN) { if (!sym->s_used && !sym->s_set) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); /* %s unused in function %s */ warning(192, sym->s_name, funcsym->s_name); } } else { if (sym->s_set && !sym->s_used) { STRUCT_ASSIGN(curr_pos, sym->s_spos); /* %s set but not used in function %s */ warning(191, sym->s_name, funcsym->s_name); } else if (!sym->s_used) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); /* %s unused in function %s */ warning(192, sym->s_name, funcsym->s_name); } } if (sc == EXTERN) { /* * information about usage is taken over into the symbol * table entry at level 0 if the symbol was locally declared * as an external symbol. * * XXX This is wrong for symbols declared static at level 0 * if the usage information stems from sizeof(). This is * because symbols at level 0 only used in sizeof() are * considered to not be used.
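 *
 * For example,
 *
 *	void f(void) { int a = 1; int b; }
 *
 * triggers both warnings above: "a set but not used in function f"
 * (191) and "b unused in function f" (192).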
*/ if ((xsym = sym->s_xsym) != NULL) { if (sym->s_used && !xsym->s_used) { xsym->s_used = 1; STRUCT_ASSIGN(xsym->s_upos, sym->s_upos); } if (sym->s_set && !xsym->s_set) { xsym->s_set = 1; STRUCT_ASSIGN(xsym->s_spos, sym->s_spos); } } } } static void chklusg(sym_t *lab) { if (blklev != 1 || lab->s_blklev != 1) LERROR("chklusg()"); if (lab->s_set && !lab->s_used) { STRUCT_ASSIGN(curr_pos, lab->s_spos); /* label %s unused in function %s */ warning(192, lab->s_name, funcsym->s_name); } else if (!lab->s_set) { STRUCT_ASSIGN(curr_pos, lab->s_upos); /* undefined label %s */ warning(23, lab->s_name); } } static void chktusg(sym_t *sym) { if (!incompl(sym->s_type)) return; /* always complain about incomplete tags declared inside blocks */ if (!zflag || dcs->d_ctx != EXTERN) return; STRUCT_ASSIGN(curr_pos, sym->s_dpos); switch (sym->s_type->t_tspec) { case STRUCT: /* struct %s never defined */ warning(233, sym->s_name); break; case UNION: /* union %s never defined */ warning(234, sym->s_name); break; case ENUM: /* enum %s never defined */ warning(235, sym->s_name); break; default: LERROR("chktusg()"); } } /* * Called after the entire translation unit has been parsed. * Changes tentative definitions into definitions. * Performs some tests on global symbols. Detected problems are: * - defined variables of incomplete type * - constant variables which are not initialized * - static symbols which are never used */ void chkglsyms(void) { sym_t *sym; pos_t cpos; if (blklev != 0 || dcs->d_nxt != NULL) norecover(); STRUCT_ASSIGN(cpos, curr_pos); for (sym = dcs->d_dlsyms; sym != NULL; sym = sym->s_dlnxt) { if (sym->s_blklev == -1) continue; if (sym->s_kind == FVFT) { chkglvar(sym); } else if (sym->s_kind == FTAG) { chktusg(sym); } else { if (sym->s_kind != FMOS) LERROR("chkglsyms()"); } } STRUCT_ASSIGN(curr_pos, cpos); } static void chkglvar(sym_t *sym) { if (sym->s_scl == TYPEDEF || sym->s_scl == ENUMCON) return; if (sym->s_scl != EXTERN && sym->s_scl != STATIC) LERROR("chkglvar()"); glchksz(sym); if (sym->s_scl == STATIC) { if (sym->s_type->t_tspec == FUNC) { if (sym->s_used && sym->s_def != DEF) { STRUCT_ASSIGN(curr_pos, sym->s_upos); /* static func. called but not def.. */ error(225, sym->s_name); } } if (!sym->s_used) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); if (sym->s_type->t_tspec == FUNC) { if (sym->s_def == DEF) { if (!sym->s_inline) /* static function %s unused */ warning(236, sym->s_name); } else { /* static function %s decl. but ... */ warning(290, sym->s_name); } } else if (!sym->s_set) { /* static variable %s unused */ warning(226, sym->s_name); } else { /* static variable %s set but not used */ warning(307, sym->s_name); } } if (!tflag && sym->s_def == TDEF && sym->s_type->t_const) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); /* const object %s should have initializer */ warning(227, sym->s_name); } } } static void glchksz(sym_t *sym) { if (sym->s_def == TDEF) { if (sym->s_type->t_tspec == FUNC) /* * this can happen if a syntax error occurred * after a function declaration */ return; STRUCT_ASSIGN(curr_pos, sym->s_dpos); if (length(sym->s_type, sym->s_name) == 0 && sym->s_type->t_tspec == ARRAY && sym->s_type->t_dim == 0) { /* empty array declaration: %s */ if (tflag || (sym->s_scl == EXTERN && !sflag)) { warning(190, sym->s_name); } else { error(190, sym->s_name); } } } } /* * Prints information about the location of the previous definition/declaration.
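 *
 * For example, after "redeclaration of x" this prints either
 * "previous definition of x" (261) or "previous declaration of x"
 * (260), depending on whether the earlier symbol was defined. The
 * output is suppressed unless rflag is set.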
*/ void prevdecl(int msg, sym_t *psym) { pos_t cpos; if (!rflag) return; STRUCT_ASSIGN(cpos, curr_pos); STRUCT_ASSIGN(curr_pos, psym->s_dpos); if (msg != -1) { message(msg, psym->s_name); } else if (psym->s_def == DEF || psym->s_def == TDEF) { /* previous definition of %s */ message(261, psym->s_name); } else { /* previous declaration of %s */ message(260, psym->s_name); } STRUCT_ASSIGN(curr_pos, cpos); } Index: user/ngie/bsnmp_cleanup/usr.sbin/bhyve/fwctl.c =================================================================== --- user/ngie/bsnmp_cleanup/usr.sbin/bhyve/fwctl.c (revision 298467) +++ user/ngie/bsnmp_cleanup/usr.sbin/bhyve/fwctl.c (revision 298468) @@ -1,549 +1,549 @@ /*- * Copyright (c) 2015 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, * but with a request/response messaging protocol. 
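 */

/*
 * A guest-side probe of this interface might look like the following
 * sketch (illustrative only; outw() and inb() stand in for whatever
 * port i/o primitives the guest firmware provides):
 */
#if 0
static int
fwctl_probe(void)
{
	char sig[4];
	int i;

	outw(0x510, 0);			/* reset: IDENT_WAIT -> IDENT_SEND */
	for (i = 0; i < 4; i++)
		sig[i] = inb(0x511);	/* expect 'B' 'H' 'Y' 'V' */
	return (memcmp(sig, "BHYV", 4) == 0);
}
#endif

/*
 * The implementation below registers in/out handlers for both ports.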
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "inout.h" #include "fwctl.h" /* * Messaging protocol base operations */ #define OP_NULL 1 #define OP_ECHO 2 #define OP_GET 3 #define OP_GET_LEN 4 #define OP_SET 5 #define OP_MAX OP_SET /* I/O ports */ #define FWCTL_OUT 0x510 #define FWCTL_IN 0x511 /* * Back-end state-machine */ enum state { DORMANT, IDENT_WAIT, IDENT_SEND, REQ, RESP } be_state = DORMANT; static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; static u_int ident_idx; struct op_info { int op; int (*op_start)(int len); void (*op_data)(uint32_t data, int len); int (*op_result)(struct iovec **data); void (*op_done)(struct iovec *data); }; static struct op_info *ops[OP_MAX+1]; /* Return 0-padded uint32_t */ static uint32_t fwctl_send_rest(uint32_t *data, size_t len) { union { uint8_t c[4]; uint32_t w; } u; uint8_t *cdata; int i; cdata = (uint8_t *) data; u.w = 0; for (i = 0, u.w = 0; i < len; i++) u.c[i] = *cdata++; return (u.w); } /* * error op dummy proto - drop all data sent and return an error */ static int errop_code; static void errop_set(int err) { errop_code = err; } static int errop_start(int len) { errop_code = ENOENT; /* accept any length */ return (errop_code); } static void errop_data(uint32_t data, int len) { /* ignore */ } static int errop_result(struct iovec **data) { /* no data to send back; always successful */ *data = NULL; return (errop_code); } static void errop_done(struct iovec *data) { /* assert data is NULL */ } static struct op_info errop_info = { .op_start = errop_start, .op_data = errop_data, .op_result = errop_result, .op_done = errop_done }; /* OID search */ SET_DECLARE(ctl_set, struct ctl); CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus)); static struct ctl * ctl_locate(const char *str, int maxlen) { struct ctl *cp, **cpp; SET_FOREACH(cpp, ctl_set) { cp = *cpp; if (!strncmp(str, cp->c_oid, maxlen)) return (cp); } return (NULL); } /* uefi-sysctl get-len */ #define FGET_STRSZ 80 static struct iovec fget_biov[2]; static char fget_str[FGET_STRSZ]; static struct { size_t f_sz; uint32_t f_data[1024]; } fget_buf; static int fget_cnt; static size_t fget_size; static int fget_start(int len) { if (len > FGET_STRSZ) return(E2BIG); fget_cnt = 0; return (0); } static void fget_data(uint32_t data, int len) { *((uint32_t *) &fget_str[fget_cnt]) = data; fget_cnt += sizeof(uint32_t); } static int fget_result(struct iovec **data, int val) { struct ctl *cp; int err; err = 0; /* Locate the OID */ cp = ctl_locate(fget_str, fget_cnt); if (cp == NULL) { *data = NULL; err = ENOENT; } else { if (val) { /* For now, copy the len/data into a buffer */ memset(&fget_buf, 0, sizeof(fget_buf)); fget_buf.f_sz = cp->c_len; memcpy(fget_buf.f_data, cp->c_data, cp->c_len); fget_biov[0].iov_base = (char *)&fget_buf; fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + cp->c_len; } else { fget_size = cp->c_len; fget_biov[0].iov_base = (char *)&fget_size; fget_biov[0].iov_len = sizeof(fget_size); } fget_biov[1].iov_base = NULL; fget_biov[1].iov_len = 0; *data = fget_biov; } return (err); } static void fget_done(struct iovec *data) { /* nothing needs to be freed */ } static int fget_len_result(struct iovec **data) { return (fget_result(data, 0)); } static int fget_val_result(struct iovec **data) { return (fget_result(data, 1)); } static struct op_info fgetlen_info = { .op_start = fget_start, .op_data = fget_data, .op_result = fget_len_result, .op_done = fget_done }; static struct op_info fgetval_info = { 
.op_start = fget_start, .op_data = fget_data, .op_result = fget_val_result, .op_done = fget_done }; static struct req_info { int req_error; u_int req_count; uint32_t req_size; uint32_t req_type; uint32_t req_txid; struct op_info *req_op; int resp_error; int resp_count; int resp_size; int resp_off; struct iovec *resp_biov; } rinfo; static void fwctl_response_done(void) { (*rinfo.req_op->op_done)(rinfo.resp_biov); /* reinit the req data struct */ memset(&rinfo, 0, sizeof(rinfo)); } static void fwctl_request_done(void) { rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); /* XXX only a single vector supported at the moment */ rinfo.resp_off = 0; if (rinfo.resp_biov == NULL) { rinfo.resp_size = 0; } else { rinfo.resp_size = rinfo.resp_biov[0].iov_len; } } static int fwctl_request_start(void) { int err; /* Data size doesn't include header */ rinfo.req_size -= 12; rinfo.req_op = &errop_info; if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) rinfo.req_op = ops[rinfo.req_type]; err = (*rinfo.req_op->op_start)(rinfo.req_size); if (err) { errop_set(err); rinfo.req_op = &errop_info; } /* Catch case of zero-length message here */ if (rinfo.req_size == 0) { fwctl_request_done(); return (1); } return (0); } static int fwctl_request_data(uint32_t value) { int remlen; /* Make sure remaining size is >= 0 */ rinfo.req_size -= sizeof(uint32_t); - remlen = (rinfo.req_size > 0) ? rinfo.req_size: 0; + remlen = MAX(rinfo.req_size, 0); (*rinfo.req_op->op_data)(value, remlen); if (rinfo.req_size < sizeof(uint32_t)) { fwctl_request_done(); return (1); } return (0); } static int fwctl_request(uint32_t value) { int ret; ret = 0; switch (rinfo.req_count) { case 0: /* Verify size */ if (value < 12) { printf("msg size error"); exit(1); } rinfo.req_size = value; rinfo.req_count = 1; break; case 1: rinfo.req_type = value; rinfo.req_count++; break; case 2: rinfo.req_txid = value; rinfo.req_count++; ret = fwctl_request_start(); break; default: ret = fwctl_request_data(value); break; } return (ret); } static int fwctl_response(uint32_t *retval) { uint32_t *dp; int remlen; switch(rinfo.resp_count) { case 0: /* 4 x u32 header len + data */ *retval = 4*sizeof(uint32_t) + roundup(rinfo.resp_size, sizeof(uint32_t)); rinfo.resp_count++; break; case 1: *retval = rinfo.req_type; rinfo.resp_count++; break; case 2: *retval = rinfo.req_txid; rinfo.resp_count++; break; case 3: *retval = rinfo.resp_error; rinfo.resp_count++; break; default: remlen = rinfo.resp_size - rinfo.resp_off; dp = (uint32_t *) ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); if (remlen >= sizeof(uint32_t)) { *retval = *dp; } else if (remlen > 0) { *retval = fwctl_send_rest(dp, remlen); } rinfo.resp_off += sizeof(uint32_t); break; } if (rinfo.resp_count > 3 && rinfo.resp_size - rinfo.resp_off <= 0) { fwctl_response_done(); return (1); } return (0); } /* * i/o port handling. 
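 *
 * The guest drives the protocol with 32-bit writes to port 0x510
 * (request words) and 32-bit reads from port 0x511 (response words);
 * a 16-bit write of 0 to 0x510 resets the state machine, after which
 * 8-bit reads of 0x511 return the 'BHYV' signature one byte at a
 * time. Each request starts with three words -- total size including
 * the 12-byte header, operation type, transaction id -- followed by
 * the payload; each response carries four words (length, type, txid,
 * error) followed by the result data.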
*/ static uint8_t fwctl_inb(void) { uint8_t retval; retval = 0xff; switch (be_state) { case IDENT_SEND: retval = sig[ident_idx++]; if (ident_idx >= sizeof(sig)) be_state = REQ; break; default: break; } return (retval); } static void fwctl_outw(uint16_t val) { switch (be_state) { case IDENT_WAIT: if (val == 0) { be_state = IDENT_SEND; ident_idx = 0; } break; default: /* ignore */ break; } } static uint32_t fwctl_inl(void) { uint32_t retval; switch (be_state) { case RESP: if (fwctl_response(&retval)) be_state = REQ; break; default: retval = 0xffffffff; break; } return (retval); } static void fwctl_outl(uint32_t val) { switch (be_state) { case REQ: if (fwctl_request(val)) be_state = RESP; default: break; } } static int fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { if (in) { if (bytes == 1) *eax = fwctl_inb(); else if (bytes == 4) *eax = fwctl_inl(); else *eax = 0xffff; } else { if (bytes == 2) fwctl_outw(*eax); else if (bytes == 4) fwctl_outl(*eax); } return (0); } INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); void fwctl_init(void) { ops[OP_GET_LEN] = &fgetlen_info; ops[OP_GET] = &fgetval_info; be_state = IDENT_WAIT; } Index: user/ngie/bsnmp_cleanup/usr.sbin/bhyve/pci_ahci.c =================================================================== --- user/ngie/bsnmp_cleanup/usr.sbin/bhyve/pci_ahci.c (revision 298467) +++ user/ngie/bsnmp_cleanup/usr.sbin/bhyve/pci_ahci.c (revision 298468) @@ -1,2347 +1,2347 @@ /*- * Copyright (c) 2013 Zhixiang Yu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) 
printf(format, ##arg) struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; char ident[20 + 1]; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * generate HBA intr depending on whether or not ports within * the controller have an interrupt pending. */ static void ahci_generate_intr(struct pci_ahci_softc *sc) { struct pci_devinst *pi; int i; pi = sc->asc_pi; for (i = 0; i < sc->ports; i++) { struct ahci_port *pr; pr = &sc->port[i]; if (pr->is & pr->ie) sc->is |= (1 << i); } DPRINTF("%s %x\n", __func__, sc->is); if (sc->is && (sc->ghc & AHCI_GHC_IE)) { if (pci_msi_enabled(pi)) { /* * Generate an MSI interrupt on every edge */ pci_generate_msi(pi, 0); } else if (!sc->lintr) { /* * Only generate a pin-based interrupt if one wasn't * in progress */ sc->lintr = 1; pci_lintr_assert(pi); } } else if (sc->lintr) { /* * No interrupts: deassert pin-based signal if it had * been asserted */ pci_lintr_deassert(pi); sc->lintr = 0; } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d\n", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { p->is |= irq; ahci_generate_intr(p->pr_sc); } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. */ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. 
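 *
 * Its slot bit was cleared in either p->sact (NCQ) or p->ci
 * above; clearing p->pending below is what finally lets
 * ahci_check_stopped() clear the running state.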
*/ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; int i, j, skip, todo, left, extra; uint32_t dbcsz; /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
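 *
 * The slot stays marked in p->pending until the request
 * completes or is cancelled in ahci_port_stop() above.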
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *to; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *from; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? 
len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint8_t buf[512]; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[4] != 0x10 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); memcpy(buf, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf, sizeof(buf)); if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); if (sectors <= 0x0fffffff) { buf[60] = sectors; buf[61] = (sectors >> 16); } else { buf[60] = 0xffff; buf[61] = 0x0fff; } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 0x3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[69] = 0; buf[75] = 31; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); buf[80] = 0x3f0; buf[81] = 0x28; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); if (candelete && !ro) { buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; buf[105] = 1; buf[169] = ATA_SUPPORT_DSM_TRIM; } buf[106] = 0x4000; buf[209] = 0x4000; if (psectsz > sectsz) { buf[106] |= 0x2000; buf[106] |= ffsl(psectsz / sectsz) - 1; buf[209] |= (psectoff / sectsz); } if (sectsz > 512) { buf[106] |= 0x1000; buf[117] = sectsz / 2; buf[118] = ((sectsz / 2) >> 16); } 
buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); buf[50] = (1 << 14 | 1); buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); buf[80] = 0x3f0; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { int msf, size; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = 
static void
atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd;
	uint8_t format;
	int len;

	acmd = cfis + 0x40;

	len = be16dec(acmd + 7);
	format = acmd[9] >> 6;
	switch (format) {
	case 0:
	{
		int msf, size;
		uint64_t sectors;
		uint8_t start_track, buf[20], *bp;

		msf = (acmd[1] >> 1) & 1;
		start_track = acmd[6];
		if (start_track > 1 && start_track != 0xaa) {
			uint32_t tfd;

			p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
			p->asc = 0x24;
			tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
			cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
			ahci_write_fis_d2h(p, slot, cfis, tfd);
			return;
		}
		bp = buf + 2;
		*bp++ = 1;
		*bp++ = 1;
		if (start_track <= 1) {
			*bp++ = 0;
			*bp++ = 0x14;
			*bp++ = 1;
			*bp++ = 0;
			if (msf) {
				*bp++ = 0;
				lba_to_msf(bp, 0);
				bp += 3;
			} else {
				*bp++ = 0;
				*bp++ = 0;
				*bp++ = 0;
				*bp++ = 0;
			}
		}
		*bp++ = 0;
		*bp++ = 0x14;
		*bp++ = 0xaa;
		*bp++ = 0;
		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
		sectors >>= 2;
		if (msf) {
			*bp++ = 0;
			lba_to_msf(bp, sectors);
			bp += 3;
		} else {
			be32enc(bp, sectors);
			bp += 4;
		}
		size = bp - buf;
		be16enc(buf, size - 2);
		if (len > size)
			len = size;
		write_prdt(p, slot, cfis, buf, len);
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	}
	case 1:
	{
		uint8_t buf[12];

		memset(buf, 0, sizeof(buf));
		buf[1] = 0xa;
		buf[2] = 0x1;
		buf[3] = 0x1;
		if (len > sizeof(buf))
			len = sizeof(buf);
		write_prdt(p, slot, cfis, buf, len);
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	}
	case 2:
	{
		int msf, size;
		uint64_t sectors;
		uint8_t *bp, buf[50];

		msf = (acmd[1] >> 1) & 1;
		bp = buf + 2;
		*bp++ = 1;
		*bp++ = 1;
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 0xa0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 0xa1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 0xa2;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
		sectors >>= 2;
		if (msf) {
			*bp++ = 0;
			lba_to_msf(bp, sectors);
			bp += 3;
		} else {
			be32enc(bp, sectors);
			bp += 4;
		}
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		if (msf) {
			*bp++ = 0;
			lba_to_msf(bp, 0);
			bp += 3;
		} else {
			*bp++ = 0;
			*bp++ = 0;
			*bp++ = 0;
			*bp++ = 0;
		}
		size = bp - buf;
		be16enc(buf, size - 2);
		if (len > size)
			len = size;
		write_prdt(p, slot, cfis, buf, len);
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	}
	default:
	{
		uint32_t tfd;

		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x24;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, tfd);
		break;
	}
	}
}

static void
atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t buf[16];

	memset(buf, 0, sizeof(buf));
	buf[3] = 8;

	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	write_prdt(p, slot, cfis, buf, sizeof(buf));
	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
}
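/*
 * READ(10)/READ(12): convert the CDB LBA and transfer length (both in
 * 2048-byte CD-ROM sectors) into byte units and queue the transfer to
 * the blockif backend.  'done' is the byte count already transferred;
 * the completion callback passes it back in so a request can be
 * continued where the previous chunk left off.
 */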
static void
atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
{
	struct ahci_ioreq *aior;
	struct ahci_cmd_hdr *hdr;
	struct ahci_prdt_entry *prdt;
	struct blockif_req *breq;
	uint8_t *acmd;
	uint64_t lba;
	uint32_t len;
	int err;

	acmd = cfis + 0x40;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);

	lba = be32dec(acmd + 2);
	if (acmd[0] == READ_10)
		len = be16dec(acmd + 7);
	else
		len = be32dec(acmd + 6);
	if (len == 0) {
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
	}
	lba *= 2048;
	len *= 2048;

	/*
	 * Pull request off free list
	 */
	aior = STAILQ_FIRST(&p->iofhd);
	assert(aior != NULL);
	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
	aior->cfis = cfis;
	aior->slot = slot;
	aior->len = len;
	aior->done = done;
	breq = &aior->io_req;
	breq->br_offset = lba + done;
	ahci_build_iov(p, aior, prdt, hdr->prdtl);

	/* Mark this command in-flight. */
	p->pending |= 1 << slot;

	/* Stuff request onto busy list. */
	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);

	err = blockif_read(p->bctx, breq);
	assert(err == 0);
}

static void
atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t buf[64];
	uint8_t *acmd;
	int len;

	acmd = cfis + 0x40;
	len = acmd[4];
	if (len > sizeof(buf))
		len = sizeof(buf);
	memset(buf, 0, len);
	buf[0] = 0x70 | (1 << 7);
	buf[2] = p->sense_key;
	buf[7] = 10;
	buf[12] = p->asc;
	write_prdt(p, slot, cfis, buf, len);
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
}

static void
atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd = cfis + 0x40;
	uint32_t tfd;

	switch (acmd[4] & 3) {
	case 0:
	case 1:
	case 3:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		tfd = ATA_S_READY | ATA_S_DSC;
		break;
	case 2:
		/* TODO eject media */
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x53;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
	}
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}

static void
atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd;
	uint32_t tfd;
	uint8_t pc, code;
	int len;

	acmd = cfis + 0x40;
	len = be16dec(acmd + 7);
	pc = acmd[2] >> 6;
	code = acmd[2] & 0x3f;

	switch (pc) {
	case 0:
		switch (code) {
		case MODEPAGE_RW_ERROR_RECOVERY:
		{
			uint8_t buf[16];

			if (len > sizeof(buf))
				len = sizeof(buf);

			memset(buf, 0, sizeof(buf));
			be16enc(buf, 16 - 2);
			buf[2] = 0x70;
			buf[8] = 0x01;
			buf[9] = 16 - 10;
			buf[11] = 0x05;
			write_prdt(p, slot, cfis, buf, len);
			tfd = ATA_S_READY | ATA_S_DSC;
			break;
		}
		case MODEPAGE_CD_CAPABILITIES:
		{
			uint8_t buf[30];

			if (len > sizeof(buf))
				len = sizeof(buf);

			memset(buf, 0, sizeof(buf));
			be16enc(buf, 30 - 2);
			buf[2] = 0x70;
			buf[8] = 0x2A;
			buf[9] = 30 - 10;
			buf[10] = 0x08;
			buf[12] = 0x71;
			be16enc(&buf[18], 2);
			be16enc(&buf[20], 512);
			write_prdt(p, slot, cfis, buf, len);
			tfd = ATA_S_READY | ATA_S_DSC;
			break;
		}
		default:
			goto error;
			break;
		}
		break;
	case 3:
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x39;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
error:
	case 1:
	case 2:
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x24;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
	}
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}

static void
atapi_get_event_status_notification(struct ahci_port *p, int slot,
    uint8_t *cfis)
{
	uint8_t *acmd;
	uint32_t tfd;

	acmd = cfis + 0x40;

	/* we don't support asynchronous operation */
	if (!(acmd[1] & 1)) {
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x24;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
	} else {
		uint8_t buf[8];
		int len;

		len = be16dec(acmd + 7);
		if (len > sizeof(buf))
			len = sizeof(buf);

		memset(buf, 0, sizeof(buf));
		be16enc(buf, 8 - 2);
		buf[2] = 0x04;
		buf[3] = 0x10;
		buf[5] = 0x02;
		write_prdt(p, slot, cfis, buf, len);
		tfd = ATA_S_READY | ATA_S_DSC;
	}
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}
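/*
 * Dispatch an ATAPI PACKET command.  The SCSI CDB is located at offset
 * 0x40 within the command FIS.  Unsupported opcodes are failed with
 * ILLEGAL REQUEST sense and asc 0x20 (invalid command operation code).
 */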
static void
handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd;

	acmd = cfis + 0x40;

#ifdef AHCI_DEBUG
	{
		int i;

		DPRINTF("ACMD:");
		for (i = 0; i < 16; i++)
			DPRINTF("%02x ", acmd[i]);
		DPRINTF("\n");
	}
#endif

	switch (acmd[0]) {
	case TEST_UNIT_READY:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case INQUIRY:
		atapi_inquiry(p, slot, cfis);
		break;
	case READ_CAPACITY:
		atapi_read_capacity(p, slot, cfis);
		break;
	case PREVENT_ALLOW:
		/* TODO */
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case READ_TOC:
		atapi_read_toc(p, slot, cfis);
		break;
	case REPORT_LUNS:
		atapi_report_luns(p, slot, cfis);
		break;
	case READ_10:
	case READ_12:
		atapi_read(p, slot, cfis, 0);
		break;
	case REQUEST_SENSE:
		atapi_request_sense(p, slot, cfis);
		break;
	case START_STOP_UNIT:
		atapi_start_stop_unit(p, slot, cfis);
		break;
	case MODE_SENSE_10:
		atapi_mode_sense(p, slot, cfis);
		break;
	case GET_EVENT_STATUS_NOTIFICATION:
		atapi_get_event_status_notification(p, slot, cfis);
		break;
	default:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x20;
		ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) |
		    ATA_S_READY | ATA_S_ERROR);
		break;
	}
}

static void
ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
{
	p->tfd |= ATA_S_BUSY;
	switch (cfis[2]) {
	case ATA_ATA_IDENTIFY:
		handle_identify(p, slot, cfis);
		break;
	case ATA_SETFEATURES:
	{
		switch (cfis[3]) {
		case ATA_SF_ENAB_SATA_SF:
			switch (cfis[12]) {
			case ATA_SATA_SF_AN:
				p->tfd = ATA_S_DSC | ATA_S_READY;
				break;
			default:
				p->tfd = ATA_S_ERROR | ATA_S_READY;
				p->tfd |= (ATA_ERROR_ABORT << 8);
				break;
			}
			break;
		case ATA_SF_ENAB_WCACHE:
		case ATA_SF_DIS_WCACHE:
		case ATA_SF_ENAB_RCACHE:
		case ATA_SF_DIS_RCACHE:
			p->tfd = ATA_S_DSC | ATA_S_READY;
			break;
		case ATA_SF_SETXFER:
		{
			switch (cfis[12] & 0xf8) {
			case ATA_PIO:
			case ATA_PIO0:
				break;
			case ATA_WDMA0:
			case ATA_UDMA0:
				p->xfermode = (cfis[12] & 0x7);
				break;
			}
			p->tfd = ATA_S_DSC | ATA_S_READY;
			break;
		}
		default:
			p->tfd = ATA_S_ERROR | ATA_S_READY;
			p->tfd |= (ATA_ERROR_ABORT << 8);
			break;
		}
		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
		break;
	}
	case ATA_SET_MULTI:
		if (cfis[12] != 0 &&
		    (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) {
			p->tfd = ATA_S_ERROR | ATA_S_READY;
			p->tfd |= (ATA_ERROR_ABORT << 8);
		} else {
			p->mult_sectors = cfis[12];
			p->tfd = ATA_S_DSC | ATA_S_READY;
		}
		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
		break;
	case ATA_READ:
	case ATA_WRITE:
	case ATA_READ48:
	case ATA_WRITE48:
	case ATA_READ_MUL:
	case ATA_WRITE_MUL:
	case ATA_READ_MUL48:
	case ATA_WRITE_MUL48:
	case ATA_READ_DMA:
	case ATA_WRITE_DMA:
	case ATA_READ_DMA48:
	case ATA_WRITE_DMA48:
	case ATA_READ_FPDMA_QUEUED:
	case ATA_WRITE_FPDMA_QUEUED:
		ahci_handle_rw(p, slot, cfis, 0);
		break;
	case ATA_FLUSHCACHE:
	case ATA_FLUSHCACHE48:
		ahci_handle_flush(p, slot, cfis);
		break;
	case ATA_DATA_SET_MANAGEMENT:
		if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM &&
		    cfis[13] == 0 && cfis[12] == 1) {
			ahci_handle_dsm_trim(p, slot, cfis, 0);
			break;
		}
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	case ATA_SEND_FPDMA_QUEUED:
		if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM &&
		    cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM &&
		    cfis[11] == 0 && cfis[13] == 1) {
			ahci_handle_dsm_trim(p, slot, cfis, 0);
			break;
		}
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	case ATA_READ_LOG_EXT:
	case ATA_READ_LOG_DMA_EXT:
		ahci_handle_read_log(p, slot, cfis);
		break;
	case ATA_SECURITY_FREEZE_LOCK:
	case ATA_SMART_CMD:
	case ATA_NOP:
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	case ATA_CHECK_POWER_MODE:
		cfis[12] = 0xff;	/* always on */
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case ATA_STANDBY_CMD:
	case ATA_STANDBY_IMMEDIATE:
	case ATA_IDLE_CMD:
	case ATA_IDLE_IMMEDIATE:
	case ATA_SLEEP:
	case ATA_READ_VERIFY:
	case ATA_READ_VERIFY48:
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case ATA_ATAPI_IDENTIFY:
		handle_atapi_identify(p, slot, cfis);
		break;
	case ATA_PACKET_CMD:
		if (!p->atapi) {
			ahci_write_fis_d2h(p, slot, cfis,
			    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		} else
			handle_packet_cmd(p, slot, cfis);
		break;
	default:
		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	}
}
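/*
 * Process a single command slot: map the command table from guest
 * memory, check that it contains a host-to-device register FIS, and
 * either dispatch it as an ATA/ATAPI command or handle it as a
 * control FIS used for the port reset handshake.
 */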
static void
ahci_handle_slot(struct ahci_port *p, int slot)
{
	struct ahci_cmd_hdr *hdr;
	struct ahci_prdt_entry *prdt;
	struct pci_ahci_softc *sc;
	uint8_t *cfis;
	int cfl;
#ifdef AHCI_DEBUG
	int i;	/* used only by the debug dump below */
#endif

	sc = p->pr_sc;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
	cfl = (hdr->flags & 0x1f) * 4;
	cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
	    0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);

#ifdef AHCI_DEBUG
	DPRINTF("\ncfis:");
	for (i = 0; i < cfl; i++) {
		if (i % 10 == 0)
			DPRINTF("\n");
		DPRINTF("%02x ", cfis[i]);
	}
	DPRINTF("\n");

	for (i = 0; i < hdr->prdtl; i++) {
		DPRINTF("%d@%08"PRIx64"\n",
		    prdt->dbc & 0x3fffff, prdt->dba);
		prdt++;
	}
#endif

	if (cfis[0] != FIS_TYPE_REGH2D) {
		WPRINTF("Not a H2D FIS:%02x\n", cfis[0]);
		return;
	}

	if (cfis[1] & 0x80) {
		ahci_handle_cmd(p, slot, cfis);
	} else {
		if (cfis[15] & (1 << 2))
			p->reset = 1;
		else if (p->reset) {
			p->reset = 0;
			ahci_port_reset(p);
		}
		p->ci &= ~(1 << slot);
	}
}

static void
ahci_handle_port(struct ahci_port *p)
{

	if (!(p->cmd & AHCI_P_CMD_ST))
		return;

	/*
	 * Search for any new commands to issue ignoring those that
	 * are already in-flight.  Stop if device is busy or in error.
	 */
	for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) {
		if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0)
			break;
		if (p->waitforclear)
			break;
		if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) {
			p->cmd &= ~AHCI_P_CMD_CCS_MASK;
			p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT;
			ahci_handle_slot(p, p->ccs);
		}
	}
}

/*
 * blockif callback routine - this runs in the context of the blockif
 * i/o thread, so the mutex needs to be acquired.
 */
static void
ata_ioreq_cb(struct blockif_req *br, int err)
{
	struct ahci_cmd_hdr *hdr;
	struct ahci_ioreq *aior;
	struct ahci_port *p;
	struct pci_ahci_softc *sc;
	uint32_t tfd;
	uint8_t *cfis;
	int slot, ncq, dsm;

	DPRINTF("%s %d\n", __func__, err);

	ncq = dsm = 0;
	aior = br->br_param;
	p = aior->io_pr;
	cfis = aior->cfis;
	slot = aior->slot;
	sc = p->pr_sc;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);

	if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
	    cfis[2] == ATA_READ_FPDMA_QUEUED ||
	    cfis[2] == ATA_SEND_FPDMA_QUEUED)
		ncq = 1;
	if (cfis[2] == ATA_DATA_SET_MANAGEMENT ||
	    (cfis[2] == ATA_SEND_FPDMA_QUEUED &&
	     (cfis[13] & 0x1f) == ATA_SFPDMA_DSM))
		dsm = 1;

	pthread_mutex_lock(&sc->mtx);

	/*
	 * Delete the blockif request from the busy list
	 */
	TAILQ_REMOVE(&p->iobhd, aior, io_blist);

	/*
	 * Move the blockif request back to the free list
	 */
	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);

	if (!err)
		hdr->prdbc = aior->done;

	if (!err && aior->more) {
		if (dsm)
			ahci_handle_dsm_trim(p, slot, cfis, aior->done);
		else
			ahci_handle_rw(p, slot, cfis, aior->done);
		goto out;
	}

	if (!err)
		tfd = ATA_S_READY | ATA_S_DSC;
	else
		tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
	if (ncq)
		ahci_write_fis_sdb(p, slot, cfis, tfd);
	else
		ahci_write_fis_d2h(p, slot, cfis, tfd);

	/*
	 * This command is now complete.
	 */
	p->pending &= ~(1 << slot);

	ahci_check_stopped(p);
	ahci_handle_port(p);
out:
	pthread_mutex_unlock(&sc->mtx);
	DPRINTF("%s exit\n", __func__);
}
static void
atapi_ioreq_cb(struct blockif_req *br, int err)
{
	struct ahci_cmd_hdr *hdr;
	struct ahci_ioreq *aior;
	struct ahci_port *p;
	struct pci_ahci_softc *sc;
	uint8_t *cfis;
	uint32_t tfd;
	int slot;

	DPRINTF("%s %d\n", __func__, err);

	aior = br->br_param;
	p = aior->io_pr;
	cfis = aior->cfis;
	slot = aior->slot;
	sc = p->pr_sc;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE);

	pthread_mutex_lock(&sc->mtx);

	/*
	 * Delete the blockif request from the busy list
	 */
	TAILQ_REMOVE(&p->iobhd, aior, io_blist);

	/*
	 * Move the blockif request back to the free list
	 */
	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);

	if (!err)
		hdr->prdbc = aior->done;

	if (!err && aior->more) {
		atapi_read(p, slot, cfis, aior->done);
		goto out;
	}

	if (!err) {
		tfd = ATA_S_READY | ATA_S_DSC;
	} else {
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x21;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
	}
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, tfd);

	/*
	 * This command is now complete.
	 */
	p->pending &= ~(1 << slot);

	ahci_check_stopped(p);
	ahci_handle_port(p);
out:
	pthread_mutex_unlock(&sc->mtx);
	DPRINTF("%s exit\n", __func__);
}

static void
pci_ahci_ioreq_init(struct ahci_port *pr)
{
	struct ahci_ioreq *vr;
	int i;

	pr->ioqsz = blockif_queuesz(pr->bctx);
	pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq));
	STAILQ_INIT(&pr->iofhd);

	/*
	 * Add all i/o request entries to the free queue
	 */
	for (i = 0; i < pr->ioqsz; i++) {
		vr = &pr->ioreq[i];
		vr->io_pr = pr;
		if (!pr->atapi)
			vr->io_req.br_callback = ata_ioreq_cb;
		else
			vr->io_req.br_callback = atapi_ioreq_cb;
		vr->io_req.br_param = vr;
		STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist);
	}

	TAILQ_INIT(&pr->iobhd);
}

static void
pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
{
	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
	struct ahci_port *p = &sc->port[port];

	DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
	    port, offset, value);

	switch (offset) {
	case AHCI_P_CLB:
		p->clb = value;
		break;
	case AHCI_P_CLBU:
		p->clbu = value;
		break;
	case AHCI_P_FB:
		p->fb = value;
		break;
	case AHCI_P_FBU:
		p->fbu = value;
		break;
	case AHCI_P_IS:
		p->is &= ~value;
		break;
	case AHCI_P_IE:
		p->ie = value & 0xFDC000FF;
		ahci_generate_intr(sc);
		break;
	case AHCI_P_CMD:
	{
		p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
		    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
		    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
		    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK);
		p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
		    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
		    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
		    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value;

		if (!(value & AHCI_P_CMD_ST)) {
			ahci_port_stop(p);
		} else {
			uint64_t clb;

			p->cmd |= AHCI_P_CMD_CR;
			clb = (uint64_t)p->clbu << 32 | p->clb;
			p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb,
			    AHCI_CL_SIZE * AHCI_MAX_SLOTS);
		}

		if (value & AHCI_P_CMD_FRE) {
			uint64_t fb;

			p->cmd |= AHCI_P_CMD_FR;
			fb = (uint64_t)p->fbu << 32 | p->fb;
			/* we don't support FBSCP, so rfis size is 256Bytes */
			p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256);
		} else {
			p->cmd &= ~AHCI_P_CMD_FR;
		}

		if (value & AHCI_P_CMD_CLO) {
			p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ);
			p->cmd &= ~AHCI_P_CMD_CLO;
		}

		if (value & AHCI_P_CMD_ICC_MASK) {
			p->cmd &= ~AHCI_P_CMD_ICC_MASK;
		}

		ahci_handle_port(p);
		break;
	}
	case AHCI_P_TFD:
	case AHCI_P_SIG:
	case AHCI_P_SSTS:
		WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n",
		    offset);
		break;
	case AHCI_P_SCTL:
		p->sctl = value;
		if (!(p->cmd & AHCI_P_CMD_ST)) {
			if (value & ATA_SC_DET_RESET)
				ahci_port_reset(p);
		}
		break;
	case AHCI_P_SERR:
		p->serr &= ~value;
		break;
	case AHCI_P_SACT:
		p->sact |= value;
		break;
	case AHCI_P_CI:
		p->ci |= value;
		ahci_handle_port(p);
		break;
	case AHCI_P_SNTF:
	case AHCI_P_FBS:
	default:
		break;
	}
}
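/*
 * Writes to the HBA global registers.  Only GHC (controller reset,
 * interrupt enable) and IS (write-1-to-clear interrupt status) have
 * side effects here; CAP, PI, VS and CAP2 are read only.
 */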
static void
pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
{
	DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
	    offset, value);

	switch (offset) {
	case AHCI_CAP:
	case AHCI_PI:
	case AHCI_VS:
	case AHCI_CAP2:
		DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n",
		    offset);
		break;
	case AHCI_GHC:
		if (value & AHCI_GHC_HR)
			ahci_reset(sc);
		else if (value & AHCI_GHC_IE) {
			sc->ghc |= AHCI_GHC_IE;
			ahci_generate_intr(sc);
		}
		break;
	case AHCI_IS:
		sc->is &= ~value;
		ahci_generate_intr(sc);
		break;
	default:
		break;
	}
}

static void
pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_ahci_softc *sc = pi->pi_arg;

	assert(baridx == 5);
	assert((offset % 4) == 0 && size == 4);

	pthread_mutex_lock(&sc->mtx);

	if (offset < AHCI_OFFSET)
		pci_ahci_host_write(sc, offset, value);
	else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
		pci_ahci_port_write(sc, offset, value);
	else
		WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n",
		    offset);

	pthread_mutex_unlock(&sc->mtx);
}

static uint64_t
pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset)
{
	uint32_t value;

	switch (offset) {
	case AHCI_CAP:
	case AHCI_GHC:
	case AHCI_IS:
	case AHCI_PI:
	case AHCI_VS:
	case AHCI_CCCC:
	case AHCI_CCCP:
	case AHCI_EM_LOC:
	case AHCI_EM_CTL:
	case AHCI_CAP2:
	{
		uint32_t *p = &sc->cap;
		p += (offset - AHCI_CAP) / sizeof(uint32_t);
		value = *p;
		break;
	}
	default:
		value = 0;
		break;
	}
	DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n",
	    offset, value);

	return (value);
}

static uint64_t
pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
{
	uint32_t value;
	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
	offset = (offset - AHCI_OFFSET) % AHCI_STEP;

	switch (offset) {
	case AHCI_P_CLB:
	case AHCI_P_CLBU:
	case AHCI_P_FB:
	case AHCI_P_FBU:
	case AHCI_P_IS:
	case AHCI_P_IE:
	case AHCI_P_CMD:
	case AHCI_P_TFD:
	case AHCI_P_SIG:
	case AHCI_P_SSTS:
	case AHCI_P_SCTL:
	case AHCI_P_SERR:
	case AHCI_P_SACT:
	case AHCI_P_CI:
	case AHCI_P_SNTF:
	case AHCI_P_FBS:
	{
		uint32_t *p = &sc->port[port].clb;
		p += (offset - AHCI_P_CLB) / sizeof(uint32_t);
		value = *p;
		break;
	}
	default:
		value = 0;
		break;
	}

	DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n",
	    port, offset, value);

	return (value);
}

static uint64_t
pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t regoff, int size)
{
	struct pci_ahci_softc *sc = pi->pi_arg;
	uint64_t offset;
	uint32_t value;

	assert(baridx == 5);
	assert(size == 1 || size == 2 || size == 4);
	assert((regoff & (size - 1)) == 0);

	pthread_mutex_lock(&sc->mtx);

	offset = regoff & ~0x3;	    /* round down to a multiple of 4 bytes */
	if (offset < AHCI_OFFSET)
		value = pci_ahci_host_read(sc, offset);
	else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
		value = pci_ahci_port_read(sc, offset);
	else {
		value = 0;
		WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n",
		    regoff);
	}
	value >>= 8 * (regoff & 0x3);

	pthread_mutex_unlock(&sc->mtx);

	return (value);
}
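/*
 * Common initialization for the ahci-hd and ahci-cd device models.
 * The controller advertises itself as an Intel ICH8 AHCI HBA
 * (vendor 0x8086, device 0x2821) with only port 0 implemented and
 * backed by the image file named in 'opts'.
 */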
static int
pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
{
	char bident[sizeof("XX:X:X")];
	struct blockif_ctxt *bctxt;
	struct pci_ahci_softc *sc;
	int ret, slots;
	MD5_CTX mdctx;
	u_char digest[16];

	ret = 0;

	if (opts == NULL) {
		fprintf(stderr, "pci_ahci: backing device required\n");
		return (1);
	}

#ifdef AHCI_DEBUG
	dbg = fopen("/tmp/log", "w+");
#endif

	sc = calloc(1, sizeof(struct pci_ahci_softc));
	pi->pi_arg = sc;
	sc->asc_pi = pi;
	sc->ports = MAX_PORTS;

	/*
	 * Only use port 0 for a backing device. All other ports will be
	 * marked as unused
	 */
	sc->port[0].atapi = atapi;

	/*
	 * Attempt to open the backing image. Use the PCI
	 * slot/func for the identifier string.
	 */
	snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
	bctxt = blockif_open(opts, bident);
	if (bctxt == NULL) {
		ret = 1;
		goto open_fail;
	}
	sc->port[0].bctx = bctxt;
	sc->port[0].pr_sc = sc;

	/*
	 * Create an identifier for the backing file. Use parts of the
	 * md5 sum of the filename
	 */
	MD5Init(&mdctx);
	MD5Update(&mdctx, opts, strlen(opts));
	MD5Final(digest, &mdctx);
	sprintf(sc->port[0].ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);

	/*
	 * Allocate blockif request structures and add them
	 * to the free list
	 */
	pci_ahci_ioreq_init(&sc->port[0]);

	pthread_mutex_init(&sc->mtx, NULL);

	/* Intel ICH8 AHCI */
	slots = sc->port[0].ioqsz;
	if (slots > 32)
		slots = 32;
	--slots;
	sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
	    AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
	    AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT) |
	    AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
	    (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);

	/* Only port 0 implemented */
	sc->pi = 1;
	sc->vs = 0x10300;
	sc->cap2 = AHCI_CAP2_APST;
	ahci_reset(sc);

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
	pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
	pci_emul_add_msicap(pi, 1);
	pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
	    AHCI_OFFSET + sc->ports * AHCI_STEP);

	pci_lintr_request(pi);

open_fail:
	if (ret) {
		if (sc->port[0].bctx != NULL)
			blockif_close(sc->port[0].bctx);
		free(sc);
	}

	return (ret);
}

static int
pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{

	return (pci_ahci_init(ctx, pi, opts, 0));
}

static int
pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{

	return (pci_ahci_init(ctx, pi, opts, 1));
}

/*
 * Use separate emulation names to distinguish drive and atapi devices
 */
struct pci_devemu pci_de_ahci_hd = {
	.pe_emu = "ahci-hd",
	.pe_init = pci_ahci_hd_init,
	.pe_barwrite = pci_ahci_write,
	.pe_barread = pci_ahci_read
};
PCI_EMUL_SET(pci_de_ahci_hd);

struct pci_devemu pci_de_ahci_cd = {
	.pe_emu = "ahci-cd",
	.pe_init = pci_ahci_atapi_init,
	.pe_barwrite = pci_ahci_write,
	.pe_barread = pci_ahci_read
};
PCI_EMUL_SET(pci_de_ahci_cd);

Index: user/ngie/bsnmp_cleanup
===================================================================
--- user/ngie/bsnmp_cleanup	(revision 298467)
+++ user/ngie/bsnmp_cleanup	(revision 298468)

Property changes on: user/ngie/bsnmp_cleanup
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r298453-298467